diff options
-rw-r--r-- | src/character.rs | 21 | ||||
-rw-r--r-- | src/config.rs | 63 | ||||
-rw-r--r-- | src/dl_list.rs | 13 | ||||
-rw-r--r-- | src/main.rs | 36 | ||||
-rw-r--r-- | src/pre_process.rs | 34 | ||||
-rw-r--r-- | src/section.rs | 34 | ||||
-rw-r--r-- | src/traits.rs | 13 |
7 files changed, 87 insertions, 127 deletions
diff --git a/src/character.rs b/src/character.rs index 98c4850..b82ac18 100644 --- a/src/character.rs +++ b/src/character.rs | |||
@@ -1,5 +1,8 @@ | |||
1 | use super::tags::Tag; | 1 | use super::tags::Tag; |
2 | use super::traits::Trait; | 2 | use super::dl_list::DLListItem; |
3 | use super::section::Section; | ||
4 | |||
5 | use std::collections::HashMap; | ||
3 | 6 | ||
4 | #[derive(Debug)] | 7 | #[derive(Debug)] |
5 | pub struct Names { | 8 | pub struct Names { |
@@ -16,8 +19,8 @@ pub struct Images { | |||
16 | 19 | ||
17 | #[derive(Debug)] | 20 | #[derive(Debug)] |
18 | pub struct Traits { | 21 | pub struct Traits { |
19 | pub official: Vec<Trait>, | 22 | pub official: Vec<DLListItem>, |
20 | pub indexed: Vec<Trait>, | 23 | pub indexed: Vec<DLListItem>, |
21 | } | 24 | } |
22 | 25 | ||
23 | #[derive(Debug)] | 26 | #[derive(Debug)] |
@@ -26,6 +29,18 @@ pub struct Character { | |||
26 | pub image: Images, | 29 | pub image: Images, |
27 | pub tags: Vec<Tag>, | 30 | pub tags: Vec<Tag>, |
28 | pub traits: Traits, | 31 | pub traits: Traits, |
32 | pub extra: Vec<DLListItem>, | ||
29 | 33 | ||
30 | pub role: Option<String> | 34 | pub role: Option<String> |
31 | } | 35 | } |
36 | |||
37 | pub fn get_sections() -> HashMap<String, Section> { | ||
38 | let mut s: HashMap<String, Section> = HashMap::new(); | ||
39 | |||
40 | s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"])); | ||
41 | s.insert("misc".into(), Section::new("misc", r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, vec!["role"])); | ||
42 | s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"])); | ||
43 | s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, vec!["tags_raw"])); | ||
44 | s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"])); | ||
45 | s | ||
46 | } | ||
diff --git a/src/config.rs b/src/config.rs deleted file mode 100644 index f491852..0000000 --- a/src/config.rs +++ /dev/null | |||
@@ -1,63 +0,0 @@ | |||
1 | extern crate yaml_rust; | ||
2 | use self::yaml_rust::YamlLoader; | ||
3 | |||
4 | use std::collections::HashMap; | ||
5 | use std::fs::File; | ||
6 | use std::io::prelude::*; | ||
7 | |||
8 | pub struct SectionConfig { | ||
9 | pub pattern: String, | ||
10 | pub groups: Vec<String>, | ||
11 | } | ||
12 | |||
13 | pub struct Config { | ||
14 | pub sections: HashMap<String, SectionConfig>, | ||
15 | } | ||
16 | |||
17 | impl Config { | ||
18 | pub fn from_file(p: &str, expected: Vec<&'static str>) -> Self { | ||
19 | let mut f = File::open(p).unwrap(); | ||
20 | let mut buf = String::new(); | ||
21 | f.read_to_string(&mut buf).unwrap(); | ||
22 | let docs = YamlLoader::load_from_str(&buf).unwrap(); | ||
23 | |||
24 | let doc = &docs[0]; | ||
25 | |||
26 | let mut sections: HashMap<String, SectionConfig> = HashMap::new(); | ||
27 | for (name, entry) in doc["sections"].as_hash().unwrap() { | ||
28 | sections.insert(name.as_str().unwrap().into(), | ||
29 | SectionConfig { | ||
30 | pattern: entry["pattern"].as_str().unwrap().into(), | ||
31 | groups: entry["groups"] | ||
32 | .as_vec() | ||
33 | .unwrap() | ||
34 | .into_iter() | ||
35 | .map(|v| v.as_str().unwrap().into()) | ||
36 | .collect(), | ||
37 | }); | ||
38 | } | ||
39 | |||
40 | |||
41 | for ex in &expected { | ||
42 | if !sections.contains_key(&ex.to_string()) { | ||
43 | panic!("config: section '{}' not found", ex); | ||
44 | } | ||
45 | } | ||
46 | |||
47 | { | ||
48 | let traits = &sections["traits"]; | ||
49 | if !traits.groups.contains(&"indexed_raw".to_string()) { | ||
50 | panic!("config: no group 'indexed_raw' found in section 'traits'"); | ||
51 | } | ||
52 | if !traits.groups.contains(&"official_raw".to_string()) { | ||
53 | panic!("config: no group 'official_raw' found in section 'traits'"); | ||
54 | } | ||
55 | let tags = &sections["tags"]; | ||
56 | if !tags.groups.contains(&"tags_raw".to_string()) { | ||
57 | panic!("config: no group 'tags_raw' found in section 'tags'"); | ||
58 | } | ||
59 | } | ||
60 | |||
61 | Config { sections: sections } | ||
62 | } | ||
63 | } | ||
diff --git a/src/dl_list.rs b/src/dl_list.rs new file mode 100644 index 0000000..979c332 --- /dev/null +++ b/src/dl_list.rs | |||
@@ -0,0 +1,13 @@ | |||
1 | use super::regex::Regex; | ||
2 | |||
3 | #[derive(Debug)] | ||
4 | pub struct DLListItem { | ||
5 | name: String, | ||
6 | value: String, | ||
7 | } | ||
8 | |||
9 | pub fn parse(s: &str) -> Vec<DLListItem> { | ||
10 | let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); | ||
11 | |||
12 | reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() | ||
13 | } | ||
diff --git a/src/main.rs b/src/main.rs index ee8c3eb..e76da66 100644 --- a/src/main.rs +++ b/src/main.rs | |||
@@ -1,4 +1,5 @@ | |||
1 | extern crate regex; | 1 | extern crate regex; |
2 | use regex::Regex; | ||
2 | extern crate yaml_rust; | 3 | extern crate yaml_rust; |
3 | 4 | ||
4 | extern crate walkdir; | 5 | extern crate walkdir; |
@@ -11,27 +12,23 @@ use std::path::Path; | |||
11 | use std::collections::HashMap; | 12 | use std::collections::HashMap; |
12 | 13 | ||
13 | mod pre_process; | 14 | mod pre_process; |
14 | use pre_process::Section; | 15 | mod section; |
15 | 16 | use section::Section; | |
16 | mod config; | ||
17 | use config::Config; | ||
18 | 17 | ||
19 | mod character; | 18 | mod character; |
20 | use character::{Images, Names, Traits, Character}; | 19 | use character::{Images, Names, Traits, Character}; |
21 | 20 | ||
22 | mod tags; | 21 | mod tags; |
23 | mod traits; | 22 | mod dl_list; |
23 | use dl_list::DLListItem; | ||
24 | 24 | ||
25 | fn main() { | 25 | fn main() { |
26 | let raw_files = env::var("RAW_FILES").unwrap_or("characters".into()); | 26 | let raw_files = env::var("RAW_FILES").unwrap_or("characters".into()); |
27 | let base_path = Path::new(&raw_files); | 27 | let base_path = Path::new(&raw_files); |
28 | 28 | ||
29 | let cfg = Config::from_file("config.yml", vec!["name", "image", "misc", "tags", "traits"]); | 29 | let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#).unwrap(); |
30 | 30 | ||
31 | let mut sections: HashMap<String, Section> = HashMap::new(); | 31 | let mut sections: HashMap<String, Section> = character::get_sections(); |
32 | for (name, sec) in &cfg.sections { | ||
33 | sections.insert(name.clone(), Section::new(&name, &sec.pattern, sec.groups.clone())); | ||
34 | } | ||
35 | 32 | ||
36 | for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) { | 33 | for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) { |
37 | let mut f = File::open(entry.path()).expect("could not open file"); | 34 | let mut f = File::open(entry.path()).expect("could not open file"); |
@@ -43,7 +40,14 @@ fn main() { | |||
43 | 40 | ||
44 | let buf = pre_process::strip_irrelevant_content(&buf); | 41 | let buf = pre_process::strip_irrelevant_content(&buf); |
45 | 42 | ||
46 | pre_process::split_sections(&buf, &mut sections); | 43 | section::process(&buf, &mut sections); |
44 | |||
45 | // find optional extra details | ||
46 | let mut extra_details: Vec<DLListItem> = vec![]; | ||
47 | let caps = re_extras.captures(&buf); | ||
48 | if caps.is_some() { | ||
49 | extra_details = dl_list::parse(caps.unwrap().at(1).unwrap()); | ||
50 | } | ||
47 | 51 | ||
48 | { | 52 | { |
49 | let name: &Section = &sections["name".into()]; | 53 | let name: &Section = &sections["name".into()]; |
@@ -53,7 +57,10 @@ fn main() { | |||
53 | name: Names { | 57 | name: Names { |
54 | romaji: name.data["romaji".into()].clone(), | 58 | romaji: name.data["romaji".into()].clone(), |
55 | japanese: name.data["japanese".into()].clone(), | 59 | japanese: name.data["japanese".into()].clone(), |
56 | aliases: name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(), | 60 | aliases: match name.data["aliases".into()].len() > 0 { |
61 | true => name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(), | ||
62 | false => vec![] | ||
63 | } | ||
57 | }, | 64 | }, |
58 | image: Images { | 65 | image: Images { |
59 | thumb: image.data["thumb".into()].clone(), | 66 | thumb: image.data["thumb".into()].clone(), |
@@ -61,9 +68,10 @@ fn main() { | |||
61 | }, | 68 | }, |
62 | tags: tags::parse(&(&sections["tags".into()] as &Section).data["tags_raw".into()]), | 69 | tags: tags::parse(&(&sections["tags".into()] as &Section).data["tags_raw".into()]), |
63 | traits: Traits { | 70 | traits: Traits { |
64 | official: traits::parse(&(&sections["traits"] as &Section).data["official_raw".into()]), | 71 | official: dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]), |
65 | indexed: traits::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]), | 72 | indexed: dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]), |
66 | }, | 73 | }, |
74 | extra: extra_details, | ||
67 | role: match misc.data["role".into()].len() > 0 { | 75 | role: match misc.data["role".into()].len() > 0 { |
68 | true => Some(misc.data["role".into()].clone()), | 76 | true => Some(misc.data["role".into()].clone()), |
69 | false => None | 77 | false => None |
diff --git a/src/pre_process.rs b/src/pre_process.rs index 273562d..877ddc3 100644 --- a/src/pre_process.rs +++ b/src/pre_process.rs | |||
@@ -1,7 +1,3 @@ | |||
1 | use super::regex::Regex; | ||
2 | |||
3 | use std::collections::HashMap; | ||
4 | |||
5 | pub fn strip_irrelevant_content(s: &str) -> String { | 1 | pub fn strip_irrelevant_content(s: &str) -> String { |
6 | let mut retn = ""; | 2 | let mut retn = ""; |
7 | match s.find(r#"<div class=profile id=profile>"#) { | 3 | match s.find(r#"<div class=profile id=profile>"#) { |
@@ -16,34 +12,4 @@ pub fn strip_irrelevant_content(s: &str) -> String { | |||
16 | return retn.into(); | 12 | return retn.into(); |
17 | } | 13 | } |
18 | 14 | ||
19 | pub struct Section { | ||
20 | pub name: String, | ||
21 | pub re: Regex, | ||
22 | pub keys: Vec<String>, | ||
23 | pub data: HashMap<String, String>, | ||
24 | } | ||
25 | 15 | ||
26 | impl Section { | ||
27 | pub fn new(name: &str, re: &str, groups: Vec<String>) -> Self { | ||
28 | Section { | ||
29 | name: name.into(), | ||
30 | re: Regex::new(re).unwrap(), | ||
31 | keys: groups, | ||
32 | data: HashMap::new(), | ||
33 | } | ||
34 | } | ||
35 | } | ||
36 | |||
37 | pub fn split_sections(d: &str, s: &mut HashMap<String, Section>) { | ||
38 | for (_, section) in s { | ||
39 | for m in section.re.captures_iter(d) { | ||
40 | assert!(m.len() >= section.keys.len() + 1); | ||
41 | |||
42 | let mut idx = 0; | ||
43 | for key in &section.keys { | ||
44 | section.data.insert(key.clone(), m.at(idx + 1).unwrap().into()); | ||
45 | idx += 1; | ||
46 | } | ||
47 | } | ||
48 | } | ||
49 | } | ||
diff --git a/src/section.rs b/src/section.rs new file mode 100644 index 0000000..7e492b1 --- /dev/null +++ b/src/section.rs | |||
@@ -0,0 +1,34 @@ | |||
1 | use super::regex::Regex; | ||
2 | use std::collections::HashMap; | ||
3 | |||
4 | pub struct Section { | ||
5 | pub name: String, | ||
6 | pub re: Regex, | ||
7 | pub keys: Vec<String>, | ||
8 | pub data: HashMap<String, String>, | ||
9 | } | ||
10 | |||
11 | impl Section { | ||
12 | pub fn new(name: &str, re: &str, groups: Vec<&'static str>) -> Self { | ||
13 | Section { | ||
14 | name: name.into(), | ||
15 | re: Regex::new(re).unwrap(), | ||
16 | keys: groups.into_iter().map(|s| s.into()).collect(), | ||
17 | data: HashMap::new(), | ||
18 | } | ||
19 | } | ||
20 | } | ||
21 | |||
22 | pub fn process(d: &str, s: &mut HashMap<String, Section>) { | ||
23 | for (_, section) in s { | ||
24 | for m in section.re.captures_iter(d) { | ||
25 | assert!(m.len() >= section.keys.len() + 1); | ||
26 | |||
27 | let mut idx = 0; | ||
28 | for key in &section.keys { | ||
29 | section.data.insert(key.clone(), m.at(idx + 1).unwrap().into()); | ||
30 | idx += 1; | ||
31 | } | ||
32 | } | ||
33 | } | ||
34 | } | ||
diff --git a/src/traits.rs b/src/traits.rs deleted file mode 100644 index db1b2eb..0000000 --- a/src/traits.rs +++ /dev/null | |||
@@ -1,13 +0,0 @@ | |||
1 | use super::regex::Regex; | ||
2 | |||
3 | #[derive(Debug)] | ||
4 | pub struct Trait { | ||
5 | name: String, | ||
6 | value: String, | ||
7 | } | ||
8 | |||
9 | pub fn parse(s: &str) -> Vec<Trait> { | ||
10 | let reg_trait = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); | ||
11 | |||
12 | reg_trait.captures_iter(s).map(|c| Trait { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() | ||
13 | } | ||