aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjan <jan@ruken.pw>2016-10-01 10:45:55 (UTC)
committerjan <jan@ruken.pw>2016-10-01 10:45:55 (UTC)
commitad63b49b94bf4b4596e6420e37d265a57b77d731 (patch)
tree73234b1a5f4d34e6b1771e4309374fd05bebd881
parenta6b37fa5e1bd505adfae4888896be2a3aa49ec3a (diff)
config entfernt, extra details parsen
-rw-r--r--src/character.rs21
-rw-r--r--src/config.rs63
-rw-r--r--src/dl_list.rs13
-rw-r--r--src/main.rs36
-rw-r--r--src/pre_process.rs34
-rw-r--r--src/section.rs34
-rw-r--r--src/traits.rs13
7 files changed, 87 insertions, 127 deletions
diff --git a/src/character.rs b/src/character.rs
index 98c4850..b82ac18 100644
--- a/src/character.rs
+++ b/src/character.rs
@@ -1,5 +1,8 @@
1use super::tags::Tag; 1use super::tags::Tag;
2use super::traits::Trait; 2use super::dl_list::DLListItem;
3use super::section::Section;
4
5use std::collections::HashMap;
3 6
4#[derive(Debug)] 7#[derive(Debug)]
5pub struct Names { 8pub struct Names {
@@ -16,8 +19,8 @@ pub struct Images {
16 19
17#[derive(Debug)] 20#[derive(Debug)]
18pub struct Traits { 21pub struct Traits {
19 pub official: Vec<Trait>, 22 pub official: Vec<DLListItem>,
20 pub indexed: Vec<Trait>, 23 pub indexed: Vec<DLListItem>,
21} 24}
22 25
23#[derive(Debug)] 26#[derive(Debug)]
@@ -26,6 +29,18 @@ pub struct Character {
26 pub image: Images, 29 pub image: Images,
27 pub tags: Vec<Tag>, 30 pub tags: Vec<Tag>,
28 pub traits: Traits, 31 pub traits: Traits,
32 pub extra: Vec<DLListItem>,
29 33
30 pub role: Option<String> 34 pub role: Option<String>
31} 35}
36
37pub fn get_sections() -> HashMap<String, Section> {
38 let mut s: HashMap<String, Section> = HashMap::new();
39
40 s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"]));
41 s.insert("misc".into(), Section::new("misc", r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, vec!["role"]));
42 s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"]));
43 s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, vec!["tags_raw"]));
44 s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"]));
45 s
46}
diff --git a/src/config.rs b/src/config.rs
deleted file mode 100644
index f491852..0000000
--- a/src/config.rs
+++ /dev/null
@@ -1,63 +0,0 @@
1extern crate yaml_rust;
2use self::yaml_rust::YamlLoader;
3
4use std::collections::HashMap;
5use std::fs::File;
6use std::io::prelude::*;
7
/// Configuration of a single section as read from the config file.
#[derive(Debug, Clone, PartialEq)]
pub struct SectionConfig {
    /// Regular expression source for the section.
    pub pattern: String,
    /// Names assigned to the pattern's capture groups.
    pub groups: Vec<String>,
}
12
13pub struct Config {
14 pub sections: HashMap<String, SectionConfig>,
15}
16
17impl Config {
18 pub fn from_file(p: &str, expected: Vec<&'static str>) -> Self {
19 let mut f = File::open(p).unwrap();
20 let mut buf = String::new();
21 f.read_to_string(&mut buf).unwrap();
22 let docs = YamlLoader::load_from_str(&buf).unwrap();
23
24 let doc = &docs[0];
25
26 let mut sections: HashMap<String, SectionConfig> = HashMap::new();
27 for (name, entry) in doc["sections"].as_hash().unwrap() {
28 sections.insert(name.as_str().unwrap().into(),
29 SectionConfig {
30 pattern: entry["pattern"].as_str().unwrap().into(),
31 groups: entry["groups"]
32 .as_vec()
33 .unwrap()
34 .into_iter()
35 .map(|v| v.as_str().unwrap().into())
36 .collect(),
37 });
38 }
39
40
41 for ex in &expected {
42 if !sections.contains_key(&ex.to_string()) {
43 panic!("config: section '{}' not found", ex);
44 }
45 }
46
47 {
48 let traits = &sections["traits"];
49 if !traits.groups.contains(&"indexed_raw".to_string()) {
50 panic!("config: no group 'indexed_raw' found in section 'traits'");
51 }
52 if !traits.groups.contains(&"official_raw".to_string()) {
53 panic!("config: no group 'official_raw' found in section 'traits'");
54 }
55 let tags = &sections["tags"];
56 if !tags.groups.contains(&"tags_raw".to_string()) {
57 panic!("config: no group 'tags_raw' found in section 'tags'");
58 }
59 }
60
61 Config { sections: sections }
62 }
63}
diff --git a/src/dl_list.rs b/src/dl_list.rs
new file mode 100644
index 0000000..979c332
--- /dev/null
+++ b/src/dl_list.rs
@@ -0,0 +1,13 @@
1use super::regex::Regex;
2
/// One entry of an HTML definition list: the `<dt>` text paired with its `<dd>` text.
#[derive(Debug, Clone, PartialEq)]
pub struct DLListItem {
    /// Text captured from inside the `<dt>` element.
    pub name: String,
    /// Text captured from inside the `<dd>` element.
    pub value: String,
}
8
9pub fn parse(s: &str) -> Vec<DLListItem> {
10 let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap();
11
12 reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect()
13}
diff --git a/src/main.rs b/src/main.rs
index ee8c3eb..e76da66 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
1extern crate regex; 1extern crate regex;
2use regex::Regex;
2extern crate yaml_rust; 3extern crate yaml_rust;
3 4
4extern crate walkdir; 5extern crate walkdir;
@@ -11,27 +12,23 @@ use std::path::Path;
11use std::collections::HashMap; 12use std::collections::HashMap;
12 13
13mod pre_process; 14mod pre_process;
14use pre_process::Section; 15mod section;
15 16use section::Section;
16mod config;
17use config::Config;
18 17
19mod character; 18mod character;
20use character::{Images, Names, Traits, Character}; 19use character::{Images, Names, Traits, Character};
21 20
22mod tags; 21mod tags;
23mod traits; 22mod dl_list;
23use dl_list::DLListItem;
24 24
25fn main() { 25fn main() {
26 let raw_files = env::var("RAW_FILES").unwrap_or("characters".into()); 26 let raw_files = env::var("RAW_FILES").unwrap_or("characters".into());
27 let base_path = Path::new(&raw_files); 27 let base_path = Path::new(&raw_files);
28 28
29 let cfg = Config::from_file("config.yml", vec!["name", "image", "misc", "tags", "traits"]); 29 let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#).unwrap();
30 30
31 let mut sections: HashMap<String, Section> = HashMap::new(); 31 let mut sections: HashMap<String, Section> = character::get_sections();
32 for (name, sec) in &cfg.sections {
33 sections.insert(name.clone(), Section::new(&name, &sec.pattern, sec.groups.clone()));
34 }
35 32
36 for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) { 33 for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) {
37 let mut f = File::open(entry.path()).expect("could not open file"); 34 let mut f = File::open(entry.path()).expect("could not open file");
@@ -43,7 +40,14 @@ fn main() {
43 40
44 let buf = pre_process::strip_irrelevant_content(&buf); 41 let buf = pre_process::strip_irrelevant_content(&buf);
45 42
46 pre_process::split_sections(&buf, &mut sections); 43 section::process(&buf, &mut sections);
44
45 // find optional extra details
46 let mut extra_details: Vec<DLListItem> = vec![];
47 let caps = re_extras.captures(&buf);
48 if caps.is_some() {
49 extra_details = dl_list::parse(caps.unwrap().at(1).unwrap());
50 }
47 51
48 { 52 {
49 let name: &Section = &sections["name".into()]; 53 let name: &Section = &sections["name".into()];
@@ -53,7 +57,10 @@ fn main() {
53 name: Names { 57 name: Names {
54 romaji: name.data["romaji".into()].clone(), 58 romaji: name.data["romaji".into()].clone(),
55 japanese: name.data["japanese".into()].clone(), 59 japanese: name.data["japanese".into()].clone(),
56 aliases: name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(), 60 aliases: match name.data["aliases".into()].len() > 0 {
61 true => name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(),
62 false => vec![]
63 }
57 }, 64 },
58 image: Images { 65 image: Images {
59 thumb: image.data["thumb".into()].clone(), 66 thumb: image.data["thumb".into()].clone(),
@@ -61,9 +68,10 @@ fn main() {
61 }, 68 },
62 tags: tags::parse(&(&sections["tags".into()] as &Section).data["tags_raw".into()]), 69 tags: tags::parse(&(&sections["tags".into()] as &Section).data["tags_raw".into()]),
63 traits: Traits { 70 traits: Traits {
64 official: traits::parse(&(&sections["traits"] as &Section).data["official_raw".into()]), 71 official: dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]),
65 indexed: traits::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]), 72 indexed: dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]),
66 }, 73 },
74 extra: extra_details,
67 role: match misc.data["role".into()].len() > 0 { 75 role: match misc.data["role".into()].len() > 0 {
68 true => Some(misc.data["role".into()].clone()), 76 true => Some(misc.data["role".into()].clone()),
69 false => None 77 false => None
diff --git a/src/pre_process.rs b/src/pre_process.rs
index 273562d..877ddc3 100644
--- a/src/pre_process.rs
+++ b/src/pre_process.rs
@@ -1,7 +1,3 @@
1use super::regex::Regex;
2
3use std::collections::HashMap;
4
5pub fn strip_irrelevant_content(s: &str) -> String { 1pub fn strip_irrelevant_content(s: &str) -> String {
6 let mut retn = ""; 2 let mut retn = "";
7 match s.find(r#"<div class=profile id=profile>"#) { 3 match s.find(r#"<div class=profile id=profile>"#) {
@@ -16,34 +12,4 @@ pub fn strip_irrelevant_content(s: &str) -> String {
16 return retn.into(); 12 return retn.into();
17} 13}
18 14
19pub struct Section {
20 pub name: String,
21 pub re: Regex,
22 pub keys: Vec<String>,
23 pub data: HashMap<String, String>,
24}
25 15
26impl Section {
27 pub fn new(name: &str, re: &str, groups: Vec<String>) -> Self {
28 Section {
29 name: name.into(),
30 re: Regex::new(re).unwrap(),
31 keys: groups,
32 data: HashMap::new(),
33 }
34 }
35}
36
37pub fn split_sections(d: &str, s: &mut HashMap<String, Section>) {
38 for (_, section) in s {
39 for m in section.re.captures_iter(d) {
40 assert!(m.len() >= section.keys.len() + 1);
41
42 let mut idx = 0;
43 for key in &section.keys {
44 section.data.insert(key.clone(), m.at(idx + 1).unwrap().into());
45 idx += 1;
46 }
47 }
48 }
49}
diff --git a/src/section.rs b/src/section.rs
new file mode 100644
index 0000000..7e492b1
--- /dev/null
+++ b/src/section.rs
@@ -0,0 +1,34 @@
1use super::regex::Regex;
2use std::collections::HashMap;
3
4pub struct Section {
5 pub name: String,
6 pub re: Regex,
7 pub keys: Vec<String>,
8 pub data: HashMap<String, String>,
9}
10
11impl Section {
12 pub fn new(name: &str, re: &str, groups: Vec<&'static str>) -> Self {
13 Section {
14 name: name.into(),
15 re: Regex::new(re).unwrap(),
16 keys: groups.into_iter().map(|s| s.into()).collect(),
17 data: HashMap::new(),
18 }
19 }
20}
21
22pub fn process(d: &str, s: &mut HashMap<String, Section>) {
23 for (_, section) in s {
24 for m in section.re.captures_iter(d) {
25 assert!(m.len() >= section.keys.len() + 1);
26
27 let mut idx = 0;
28 for key in &section.keys {
29 section.data.insert(key.clone(), m.at(idx + 1).unwrap().into());
30 idx += 1;
31 }
32 }
33 }
34}
diff --git a/src/traits.rs b/src/traits.rs
deleted file mode 100644
index db1b2eb..0000000
--- a/src/traits.rs
+++ /dev/null
@@ -1,13 +0,0 @@
1use super::regex::Regex;
2
/// One trait entry: the `<dt>` text paired with its `<dd>` text.
#[derive(Debug, Clone, PartialEq)]
pub struct Trait {
    /// Text captured from inside the `<dt>` element.
    pub name: String,
    /// Text captured from inside the `<dd>` element.
    pub value: String,
}
8
9pub fn parse(s: &str) -> Vec<Trait> {
10 let reg_trait = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap();
11
12 reg_trait.captures_iter(s).map(|c| Trait { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect()
13}