diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/character.rs | 21 | ||||
| -rw-r--r-- | src/config.rs | 63 | ||||
| -rw-r--r-- | src/dl_list.rs | 13 | ||||
| -rw-r--r-- | src/main.rs | 36 | ||||
| -rw-r--r-- | src/pre_process.rs | 34 | ||||
| -rw-r--r-- | src/section.rs | 34 | ||||
| -rw-r--r-- | src/traits.rs | 13 |
7 files changed, 87 insertions, 127 deletions
diff --git a/src/character.rs b/src/character.rs index 98c4850..b82ac18 100644 --- a/src/character.rs +++ b/src/character.rs | |||
| @@ -1,5 +1,8 @@ | |||
| 1 | use super::tags::Tag; | 1 | use super::tags::Tag; |
| 2 | use super::traits::Trait; | 2 | use super::dl_list::DLListItem; |
| 3 | use super::section::Section; | ||
| 4 | |||
| 5 | use std::collections::HashMap; | ||
| 3 | 6 | ||
| 4 | #[derive(Debug)] | 7 | #[derive(Debug)] |
| 5 | pub struct Names { | 8 | pub struct Names { |
| @@ -16,8 +19,8 @@ pub struct Images { | |||
| 16 | 19 | ||
| 17 | #[derive(Debug)] | 20 | #[derive(Debug)] |
| 18 | pub struct Traits { | 21 | pub struct Traits { |
| 19 | pub official: Vec<Trait>, | 22 | pub official: Vec<DLListItem>, |
| 20 | pub indexed: Vec<Trait>, | 23 | pub indexed: Vec<DLListItem>, |
| 21 | } | 24 | } |
| 22 | 25 | ||
| 23 | #[derive(Debug)] | 26 | #[derive(Debug)] |
| @@ -26,6 +29,18 @@ pub struct Character { | |||
| 26 | pub image: Images, | 29 | pub image: Images, |
| 27 | pub tags: Vec<Tag>, | 30 | pub tags: Vec<Tag>, |
| 28 | pub traits: Traits, | 31 | pub traits: Traits, |
| 32 | pub extra: Vec<DLListItem>, | ||
| 29 | 33 | ||
| 30 | pub role: Option<String> | 34 | pub role: Option<String> |
| 31 | } | 35 | } |
| 36 | |||
| 37 | pub fn get_sections() -> HashMap<String, Section> { | ||
| 38 | let mut s: HashMap<String, Section> = HashMap::new(); | ||
| 39 | |||
| 40 | s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"])); | ||
| 41 | s.insert("misc".into(), Section::new("misc", r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, vec!["role"])); | ||
| 42 | s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"])); | ||
| 43 | s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, vec!["tags_raw"])); | ||
| 44 | s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"])); | ||
| 45 | s | ||
| 46 | } | ||
diff --git a/src/config.rs b/src/config.rs deleted file mode 100644 index f491852..0000000 --- a/src/config.rs +++ /dev/null | |||
| @@ -1,63 +0,0 @@ | |||
| 1 | extern crate yaml_rust; | ||
| 2 | use self::yaml_rust::YamlLoader; | ||
| 3 | |||
| 4 | use std::collections::HashMap; | ||
| 5 | use std::fs::File; | ||
| 6 | use std::io::prelude::*; | ||
| 7 | |||
| 8 | pub struct SectionConfig { | ||
| 9 | pub pattern: String, | ||
| 10 | pub groups: Vec<String>, | ||
| 11 | } | ||
| 12 | |||
| 13 | pub struct Config { | ||
| 14 | pub sections: HashMap<String, SectionConfig>, | ||
| 15 | } | ||
| 16 | |||
| 17 | impl Config { | ||
| 18 | pub fn from_file(p: &str, expected: Vec<&'static str>) -> Self { | ||
| 19 | let mut f = File::open(p).unwrap(); | ||
| 20 | let mut buf = String::new(); | ||
| 21 | f.read_to_string(&mut buf).unwrap(); | ||
| 22 | let docs = YamlLoader::load_from_str(&buf).unwrap(); | ||
| 23 | |||
| 24 | let doc = &docs[0]; | ||
| 25 | |||
| 26 | let mut sections: HashMap<String, SectionConfig> = HashMap::new(); | ||
| 27 | for (name, entry) in doc["sections"].as_hash().unwrap() { | ||
| 28 | sections.insert(name.as_str().unwrap().into(), | ||
| 29 | SectionConfig { | ||
| 30 | pattern: entry["pattern"].as_str().unwrap().into(), | ||
| 31 | groups: entry["groups"] | ||
| 32 | .as_vec() | ||
| 33 | .unwrap() | ||
| 34 | .into_iter() | ||
| 35 | .map(|v| v.as_str().unwrap().into()) | ||
| 36 | .collect(), | ||
| 37 | }); | ||
| 38 | } | ||
| 39 | |||
| 40 | |||
| 41 | for ex in &expected { | ||
| 42 | if !sections.contains_key(&ex.to_string()) { | ||
| 43 | panic!("config: section '{}' not found", ex); | ||
| 44 | } | ||
| 45 | } | ||
| 46 | |||
| 47 | { | ||
| 48 | let traits = &sections["traits"]; | ||
| 49 | if !traits.groups.contains(&"indexed_raw".to_string()) { | ||
| 50 | panic!("config: no group 'indexed_raw' found in section 'traits'"); | ||
| 51 | } | ||
| 52 | if !traits.groups.contains(&"official_raw".to_string()) { | ||
| 53 | panic!("config: no group 'official_raw' found in section 'traits'"); | ||
| 54 | } | ||
| 55 | let tags = &sections["tags"]; | ||
| 56 | if !tags.groups.contains(&"tags_raw".to_string()) { | ||
| 57 | panic!("config: no group 'tags_raw' found in section 'tags'"); | ||
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 | Config { sections: sections } | ||
| 62 | } | ||
| 63 | } | ||
diff --git a/src/dl_list.rs b/src/dl_list.rs new file mode 100644 index 0000000..979c332 --- /dev/null +++ b/src/dl_list.rs | |||
| @@ -0,0 +1,13 @@ | |||
| 1 | use super::regex::Regex; | ||
| 2 | |||
| 3 | #[derive(Debug)] | ||
| 4 | pub struct DLListItem { | ||
| 5 | name: String, | ||
| 6 | value: String, | ||
| 7 | } | ||
| 8 | |||
| 9 | pub fn parse(s: &str) -> Vec<DLListItem> { | ||
| 10 | let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); | ||
| 11 | |||
| 12 | reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() | ||
| 13 | } | ||
diff --git a/src/main.rs b/src/main.rs index ee8c3eb..e76da66 100644 --- a/src/main.rs +++ b/src/main.rs | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | extern crate regex; | 1 | extern crate regex; |
| 2 | use regex::Regex; | ||
| 2 | extern crate yaml_rust; | 3 | extern crate yaml_rust; |
| 3 | 4 | ||
| 4 | extern crate walkdir; | 5 | extern crate walkdir; |
| @@ -11,27 +12,23 @@ use std::path::Path; | |||
| 11 | use std::collections::HashMap; | 12 | use std::collections::HashMap; |
| 12 | 13 | ||
| 13 | mod pre_process; | 14 | mod pre_process; |
| 14 | use pre_process::Section; | 15 | mod section; |
| 15 | 16 | use section::Section; | |
| 16 | mod config; | ||
| 17 | use config::Config; | ||
| 18 | 17 | ||
| 19 | mod character; | 18 | mod character; |
| 20 | use character::{Images, Names, Traits, Character}; | 19 | use character::{Images, Names, Traits, Character}; |
| 21 | 20 | ||
| 22 | mod tags; | 21 | mod tags; |
| 23 | mod traits; | 22 | mod dl_list; |
| 23 | use dl_list::DLListItem; | ||
| 24 | 24 | ||
| 25 | fn main() { | 25 | fn main() { |
| 26 | let raw_files = env::var("RAW_FILES").unwrap_or("characters".into()); | 26 | let raw_files = env::var("RAW_FILES").unwrap_or("characters".into()); |
| 27 | let base_path = Path::new(&raw_files); | 27 | let base_path = Path::new(&raw_files); |
| 28 | 28 | ||
| 29 | let cfg = Config::from_file("config.yml", vec!["name", "image", "misc", "tags", "traits"]); | 29 | let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#).unwrap(); |
| 30 | 30 | ||
| 31 | let mut sections: HashMap<String, Section> = HashMap::new(); | 31 | let mut sections: HashMap<String, Section> = character::get_sections(); |
| 32 | for (name, sec) in &cfg.sections { | ||
| 33 | sections.insert(name.clone(), Section::new(&name, &sec.pattern, sec.groups.clone())); | ||
| 34 | } | ||
| 35 | 32 | ||
| 36 | for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) { | 33 | for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) { |
| 37 | let mut f = File::open(entry.path()).expect("could not open file"); | 34 | let mut f = File::open(entry.path()).expect("could not open file"); |
| @@ -43,7 +40,14 @@ fn main() { | |||
| 43 | 40 | ||
| 44 | let buf = pre_process::strip_irrelevant_content(&buf); | 41 | let buf = pre_process::strip_irrelevant_content(&buf); |
| 45 | 42 | ||
| 46 | pre_process::split_sections(&buf, &mut sections); | 43 | section::process(&buf, &mut sections); |
| 44 | |||
| 45 | // find optional extra details | ||
| 46 | let mut extra_details: Vec<DLListItem> = vec![]; | ||
| 47 | let caps = re_extras.captures(&buf); | ||
| 48 | if caps.is_some() { | ||
| 49 | extra_details = dl_list::parse(caps.unwrap().at(1).unwrap()); | ||
| 50 | } | ||
| 47 | 51 | ||
| 48 | { | 52 | { |
| 49 | let name: &Section = &sections["name".into()]; | 53 | let name: &Section = &sections["name".into()]; |
| @@ -53,7 +57,10 @@ fn main() { | |||
| 53 | name: Names { | 57 | name: Names { |
| 54 | romaji: name.data["romaji".into()].clone(), | 58 | romaji: name.data["romaji".into()].clone(), |
| 55 | japanese: name.data["japanese".into()].clone(), | 59 | japanese: name.data["japanese".into()].clone(), |
| 56 | aliases: name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(), | 60 | aliases: match name.data["aliases".into()].len() > 0 { |
| 61 | true => name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(), | ||
| 62 | false => vec![] | ||
| 63 | } | ||
| 57 | }, | 64 | }, |
| 58 | image: Images { | 65 | image: Images { |
| 59 | thumb: image.data["thumb".into()].clone(), | 66 | thumb: image.data["thumb".into()].clone(), |
| @@ -61,9 +68,10 @@ fn main() { | |||
| 61 | }, | 68 | }, |
| 62 | tags: tags::parse(&(&sections["tags".into()] as &Section).data["tags_raw".into()]), | 69 | tags: tags::parse(&(&sections["tags".into()] as &Section).data["tags_raw".into()]), |
| 63 | traits: Traits { | 70 | traits: Traits { |
| 64 | official: traits::parse(&(&sections["traits"] as &Section).data["official_raw".into()]), | 71 | official: dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]), |
| 65 | indexed: traits::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]), | 72 | indexed: dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]), |
| 66 | }, | 73 | }, |
| 74 | extra: extra_details, | ||
| 67 | role: match misc.data["role".into()].len() > 0 { | 75 | role: match misc.data["role".into()].len() > 0 { |
| 68 | true => Some(misc.data["role".into()].clone()), | 76 | true => Some(misc.data["role".into()].clone()), |
| 69 | false => None | 77 | false => None |
diff --git a/src/pre_process.rs b/src/pre_process.rs index 273562d..877ddc3 100644 --- a/src/pre_process.rs +++ b/src/pre_process.rs | |||
| @@ -1,7 +1,3 @@ | |||
| 1 | use super::regex::Regex; | ||
| 2 | |||
| 3 | use std::collections::HashMap; | ||
| 4 | |||
| 5 | pub fn strip_irrelevant_content(s: &str) -> String { | 1 | pub fn strip_irrelevant_content(s: &str) -> String { |
| 6 | let mut retn = ""; | 2 | let mut retn = ""; |
| 7 | match s.find(r#"<div class=profile id=profile>"#) { | 3 | match s.find(r#"<div class=profile id=profile>"#) { |
| @@ -16,34 +12,4 @@ pub fn strip_irrelevant_content(s: &str) -> String { | |||
| 16 | return retn.into(); | 12 | return retn.into(); |
| 17 | } | 13 | } |
| 18 | 14 | ||
| 19 | pub struct Section { | ||
| 20 | pub name: String, | ||
| 21 | pub re: Regex, | ||
| 22 | pub keys: Vec<String>, | ||
| 23 | pub data: HashMap<String, String>, | ||
| 24 | } | ||
| 25 | 15 | ||
| 26 | impl Section { | ||
| 27 | pub fn new(name: &str, re: &str, groups: Vec<String>) -> Self { | ||
| 28 | Section { | ||
| 29 | name: name.into(), | ||
| 30 | re: Regex::new(re).unwrap(), | ||
| 31 | keys: groups, | ||
| 32 | data: HashMap::new(), | ||
| 33 | } | ||
| 34 | } | ||
| 35 | } | ||
| 36 | |||
| 37 | pub fn split_sections(d: &str, s: &mut HashMap<String, Section>) { | ||
| 38 | for (_, section) in s { | ||
| 39 | for m in section.re.captures_iter(d) { | ||
| 40 | assert!(m.len() >= section.keys.len() + 1); | ||
| 41 | |||
| 42 | let mut idx = 0; | ||
| 43 | for key in &section.keys { | ||
| 44 | section.data.insert(key.clone(), m.at(idx + 1).unwrap().into()); | ||
| 45 | idx += 1; | ||
| 46 | } | ||
| 47 | } | ||
| 48 | } | ||
| 49 | } | ||
diff --git a/src/section.rs b/src/section.rs new file mode 100644 index 0000000..7e492b1 --- /dev/null +++ b/src/section.rs | |||
| @@ -0,0 +1,34 @@ | |||
| 1 | use super::regex::Regex; | ||
| 2 | use std::collections::HashMap; | ||
| 3 | |||
| 4 | pub struct Section { | ||
| 5 | pub name: String, | ||
| 6 | pub re: Regex, | ||
| 7 | pub keys: Vec<String>, | ||
| 8 | pub data: HashMap<String, String>, | ||
| 9 | } | ||
| 10 | |||
| 11 | impl Section { | ||
| 12 | pub fn new(name: &str, re: &str, groups: Vec<&'static str>) -> Self { | ||
| 13 | Section { | ||
| 14 | name: name.into(), | ||
| 15 | re: Regex::new(re).unwrap(), | ||
| 16 | keys: groups.into_iter().map(|s| s.into()).collect(), | ||
| 17 | data: HashMap::new(), | ||
| 18 | } | ||
| 19 | } | ||
| 20 | } | ||
| 21 | |||
| 22 | pub fn process(d: &str, s: &mut HashMap<String, Section>) { | ||
| 23 | for (_, section) in s { | ||
| 24 | for m in section.re.captures_iter(d) { | ||
| 25 | assert!(m.len() >= section.keys.len() + 1); | ||
| 26 | |||
| 27 | let mut idx = 0; | ||
| 28 | for key in &section.keys { | ||
| 29 | section.data.insert(key.clone(), m.at(idx + 1).unwrap().into()); | ||
| 30 | idx += 1; | ||
| 31 | } | ||
| 32 | } | ||
| 33 | } | ||
| 34 | } | ||
diff --git a/src/traits.rs b/src/traits.rs deleted file mode 100644 index db1b2eb..0000000 --- a/src/traits.rs +++ /dev/null | |||
| @@ -1,13 +0,0 @@ | |||
| 1 | use super::regex::Regex; | ||
| 2 | |||
| 3 | #[derive(Debug)] | ||
| 4 | pub struct Trait { | ||
| 5 | name: String, | ||
| 6 | value: String, | ||
| 7 | } | ||
| 8 | |||
| 9 | pub fn parse(s: &str) -> Vec<Trait> { | ||
| 10 | let reg_trait = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); | ||
| 11 | |||
| 12 | reg_trait.captures_iter(s).map(|c| Trait { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() | ||
| 13 | } | ||
