diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/character.rs | 64 | ||||
-rw-r--r-- | src/dl_list.rs | 9 | ||||
-rw-r--r-- | src/main.rs | 20 | ||||
-rw-r--r-- | src/pre_process.rs | 6 | ||||
-rw-r--r-- | src/section.rs | 1 | ||||
-rw-r--r-- | src/series.rs | 35 | ||||
-rw-r--r-- | src/tags.rs | 9 |
7 files changed, 104 insertions, 40 deletions
diff --git a/src/character.rs b/src/character.rs index 0630a45..f4686cf 100644 --- a/src/character.rs +++ b/src/character.rs | |||
@@ -42,19 +42,29 @@ pub struct Character { | |||
42 | 42 | ||
43 | impl Names { | 43 | impl Names { |
44 | pub fn new() -> Self { | 44 | pub fn new() -> Self { |
45 | Names { romaji: String::new(), japanese: String::new(), aliases: vec![] } | 45 | Names { |
46 | romaji: String::new(), | ||
47 | japanese: String::new(), | ||
48 | aliases: vec![], | ||
49 | } | ||
46 | } | 50 | } |
47 | } | 51 | } |
48 | 52 | ||
49 | impl Images { | 53 | impl Images { |
50 | pub fn new() -> Self { | 54 | pub fn new() -> Self { |
51 | Images { thumb: String::new(), full: String::new() } | 55 | Images { |
56 | thumb: String::new(), | ||
57 | full: String::new(), | ||
58 | } | ||
52 | } | 59 | } |
53 | } | 60 | } |
54 | 61 | ||
55 | impl Traits { | 62 | impl Traits { |
56 | pub fn new() -> Self { | 63 | pub fn new() -> Self { |
57 | Traits { official: vec![], indexed: vec![] } | 64 | Traits { |
65 | official: vec![], | ||
66 | indexed: vec![], | ||
67 | } | ||
58 | } | 68 | } |
59 | } | 69 | } |
60 | 70 | ||
@@ -67,13 +77,14 @@ impl Character { | |||
67 | traits: Traits::new(), | 77 | traits: Traits::new(), |
68 | assignments: vec![], | 78 | assignments: vec![], |
69 | extra: vec![], | 79 | extra: vec![], |
70 | role: None | 80 | role: None, |
71 | } | 81 | } |
72 | } | 82 | } |
73 | 83 | ||
74 | pub fn parse(&mut self, buf: &str) { | 84 | pub fn parse(&mut self, buf: &str) { |
75 | let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#).unwrap(); | 85 | let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#) |
76 | let mut sections = get_sections(); | 86 | .unwrap(); |
87 | let mut sections = get_sections(); | ||
77 | section::process(&buf, &mut sections); | 88 | section::process(&buf, &mut sections); |
78 | 89 | ||
79 | let caps = re_extras.captures(&buf); | 90 | let caps = re_extras.captures(&buf); |
@@ -89,40 +100,57 @@ impl Character { | |||
89 | 100 | ||
90 | self.name.romaji = name.data["romaji".into()].clone(); | 101 | self.name.romaji = name.data["romaji".into()].clone(); |
91 | self.name.japanese = name.data["japanese".into()].clone(); | 102 | self.name.japanese = name.data["japanese".into()].clone(); |
92 | 103 | ||
93 | if name.data["aliases".into()].len() > 0 { | 104 | if name.data["aliases".into()].len() > 0 { |
94 | self.name.aliases = name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(); | 105 | self.name.aliases = name.data["aliases".into()] |
106 | .split(", ") | ||
107 | .map(|s| s.to_string()) | ||
108 | .collect(); | ||
95 | } | 109 | } |
96 | 110 | ||
97 | self.image.thumb = image.data["thumb".into()].clone(); | 111 | self.image.thumb = image.data["thumb".into()].clone(); |
98 | self.image.full = image.data["full".into()].clone(); | 112 | self.image.full = image.data["full".into()].clone(); |
99 | 113 | ||
100 | if tags.data.contains_key("tags_raw".into()) { | 114 | if tags.data.contains_key("tags_raw".into()) { |
101 | self.tags = tags::parse(&tags.data["tags_raw".into()]); | 115 | self.tags = tags::parse(&tags.data["tags_raw".into()]); |
102 | } | 116 | } |
103 | 117 | ||
104 | self.traits.official = dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]); | 118 | self.traits.official = |
105 | self.traits.indexed = dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]); | 119 | dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]); |
120 | self.traits.indexed = | ||
121 | dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]); | ||
106 | 122 | ||
107 | self.assignments = tiles::parse_tile_link_ids(&(&sections["assignments"] as &Section).data["raw".into()], "series"); | 123 | self.assignments = tiles::parse_tile_link_ids(&(&sections["assignments"] as &Section).data["raw".into()], "series"); |
108 | 124 | ||
109 | if misc.data["role".into()].len() > 0 { | 125 | if misc.data["role".into()].len() > 0 { |
110 | self.role = Some(misc.data["role".into()].clone()); | 126 | self.role = Some(misc.data["role".into()].clone()); |
111 | } | 127 | } |
112 | } | 128 | } |
113 | } | 129 | } |
114 | } | 130 | } |
115 | 131 | ||
116 | fn get_sections() -> HashMap<String, Section> { | 132 | fn get_sections() -> HashMap<String, Section> { |
117 | let mut s: HashMap<String, Section> = HashMap::new(); | 133 | let mut s: HashMap<String, Section> = HashMap::new(); |
118 | 134 | ||
119 | s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"])); | 135 | s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"])); |
120 | s.insert("misc".into(), Section::new("misc", r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, vec!["role"])); | 136 | s.insert("misc".into(), |
137 | Section::new("misc", | ||
138 | r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, | ||
139 | vec!["role"])); | ||
121 | s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"])); | 140 | s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"])); |
122 | s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, vec!["tags_raw"])); | 141 | s.insert("tags".into(), |
142 | Section::new("tags", | ||
143 | r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, | ||
144 | vec!["tags_raw"])); | ||
123 | s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"])); | 145 | s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"])); |
124 | s.insert("assignments".into(), Section::new("assignments", r#"(?is)appears in the following</P>(.*?)</UL>"#, vec!["raw"])); | 146 | s.insert("assignments".into(), |
125 | s.insert("chars_similar_traits".into(), Section::new("assignments", r#"(?is)with Similar Traits</H3>(.*?)</UL>"#, vec!["raw"])); | 147 | Section::new("assignments", |
148 | r#"(?is)appears in the following</P>(.*?)</UL>"#, | ||
149 | vec!["raw"])); | ||
150 | s.insert("chars_similar_traits".into(), | ||
151 | Section::new("assignments", | ||
152 | r#"(?is)with Similar Traits</H3>(.*?)</UL>"#, | ||
153 | vec!["raw"])); | ||
126 | 154 | ||
127 | s | 155 | s |
128 | } | 156 | } |
diff --git a/src/dl_list.rs b/src/dl_list.rs index 3327388..17d49fc 100644 --- a/src/dl_list.rs +++ b/src/dl_list.rs | |||
@@ -9,5 +9,12 @@ pub struct DLListItem { | |||
9 | pub fn parse(s: &str) -> Vec<DLListItem> { | 9 | pub fn parse(s: &str) -> Vec<DLListItem> { |
10 | let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); | 10 | let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); |
11 | 11 | ||
12 | reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() | 12 | reg_list_item.captures_iter(s) |
13 | .map(|c| { | ||
14 | DLListItem { | ||
15 | name: c.at(1).unwrap().into(), | ||
16 | value: c.at(2).unwrap().into(), | ||
17 | } | ||
18 | }) | ||
19 | .collect() | ||
13 | } | 20 | } |
diff --git a/src/main.rs b/src/main.rs index 7481376..f15a3dd 100644 --- a/src/main.rs +++ b/src/main.rs | |||
@@ -1,9 +1,9 @@ | |||
1 | #![feature(proc_macro)] | 1 | #![feature(proc_macro)] |
2 | #[macro_use] extern crate serde_derive; | 2 | #[macro_use] |
3 | extern crate serde_derive; | ||
3 | extern crate serde_json; | 4 | extern crate serde_json; |
4 | 5 | ||
5 | extern crate regex; | 6 | extern crate regex; |
6 | extern crate yaml_rust; | ||
7 | 7 | ||
8 | extern crate walkdir; | 8 | extern crate walkdir; |
9 | use walkdir::{WalkDir, DirEntry}; | 9 | use walkdir::{WalkDir, DirEntry}; |
@@ -38,8 +38,12 @@ fn main() { | |||
38 | 38 | ||
39 | let active_threads = Arc::new(AtomicUsize::new(0)); | 39 | let active_threads = Arc::new(AtomicUsize::new(0)); |
40 | 40 | ||
41 | 41 | ||
42 | let files: Arc<Mutex<Vec<DirEntry>>> = Arc::new(Mutex::new(WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()).collect())); | 42 | let files: Arc<Mutex<Vec<DirEntry>>> = Arc::new(Mutex::new(WalkDir::new(base_path) |
43 | .min_depth(1) | ||
44 | .into_iter() | ||
45 | .filter_map(|e| e.ok()) | ||
46 | .collect())); | ||
43 | 47 | ||
44 | for i in 0..MAX_THREADS { | 48 | for i in 0..MAX_THREADS { |
45 | let files = files.clone(); | 49 | let files = files.clone(); |
@@ -58,7 +62,7 @@ fn main() { | |||
58 | } | 62 | } |
59 | } | 63 | } |
60 | let entry = entry.unwrap(); | 64 | let entry = entry.unwrap(); |
61 | 65 | ||
62 | let mut f = File::open(entry.path()).expect("could not open file"); | 66 | let mut f = File::open(entry.path()).expect("could not open file"); |
63 | let mut buf = String::new(); | 67 | let mut buf = String::new(); |
64 | if let Err(_) = f.read_to_string(&mut buf) { | 68 | if let Err(_) = f.read_to_string(&mut buf) { |
@@ -68,12 +72,16 @@ fn main() { | |||
68 | 72 | ||
69 | let buf = pre_process::strip_irrelevant_content(&buf); | 73 | let buf = pre_process::strip_irrelevant_content(&buf); |
70 | 74 | ||
75 | println!("{:?}", entry.path()); | ||
71 | let mut char = Series::new(); | 76 | let mut char = Series::new(); |
72 | char.parse(&buf); | 77 | char.parse(&buf); |
73 | 78 | ||
74 | let json = serde_json::to_string(&char).unwrap(); | 79 | let json = serde_json::to_string(&char).unwrap(); |
75 | 80 | ||
76 | let out_file = out_path.join(entry.file_name().to_str().unwrap().replace("html", "json")); | 81 | let out_file = out_path.join(entry.file_name() |
82 | .to_str() | ||
83 | .unwrap() | ||
84 | .replace("html", "json")); | ||
77 | let mut o = File::create(&out_file).unwrap(); | 85 | let mut o = File::create(&out_file).unwrap(); |
78 | o.write_all(json.as_bytes()).unwrap(); | 86 | o.write_all(json.as_bytes()).unwrap(); |
79 | println!("{:?}", out_file); | 87 | println!("{:?}", out_file); |
diff --git a/src/pre_process.rs b/src/pre_process.rs index 877ddc3..c68528f 100644 --- a/src/pre_process.rs +++ b/src/pre_process.rs | |||
@@ -1,15 +1,13 @@ | |||
1 | pub fn strip_irrelevant_content(s: &str) -> String { | 1 | pub fn strip_irrelevant_content(s: &str) -> String { |
2 | let mut retn = ""; | 2 | let mut retn = ""; |
3 | match s.find(r#"<div class=profile id=profile>"#) { | 3 | match s.find(r#"<A href="watchdontwatch.php">"#) { |
4 | Some(pos) => retn = &s[pos..], | 4 | Some(pos) => retn = &s[pos..], |
5 | None => (), | 5 | None => (), |
6 | }; | 6 | }; |
7 | 7 | ||
8 | match s.find(r#"<INPUT style="font-size: 2em;" TYPE=SUBMIT NAME="votes" VALUE="Cast Votes">"#) { | 8 | match s.find(r#"<H3>Characters"#) { |
9 | Some(pos) => retn = &s[..pos], | 9 | Some(pos) => retn = &s[..pos], |
10 | None => (), | 10 | None => (), |
11 | }; | 11 | }; |
12 | return retn.into(); | 12 | return retn.into(); |
13 | } | 13 | } |
14 | |||
15 | |||
diff --git a/src/section.rs b/src/section.rs index 7e492b1..48f237d 100644 --- a/src/section.rs +++ b/src/section.rs | |||
@@ -1,6 +1,7 @@ | |||
1 | use super::regex::Regex; | 1 | use super::regex::Regex; |
2 | use std::collections::HashMap; | 2 | use std::collections::HashMap; |
3 | 3 | ||
4 | #[derive(Debug)] | ||
4 | pub struct Section { | 5 | pub struct Section { |
5 | pub name: String, | 6 | pub name: String, |
6 | pub re: Regex, | 7 | pub re: Regex, |
diff --git a/src/series.rs b/src/series.rs index d111fb8..3037e6a 100644 --- a/src/series.rs +++ b/src/series.rs | |||
@@ -21,12 +21,17 @@ pub struct Names { | |||
21 | #[derive(Debug, Serialize)] | 21 | #[derive(Debug, Serialize)] |
22 | pub struct Series { | 22 | pub struct Series { |
23 | pub name: Names, | 23 | pub name: Names, |
24 | pub tags: Vec<String>, | ||
24 | } | 25 | } |
25 | 26 | ||
26 | impl Names { | 27 | impl Names { |
27 | pub fn new() -> Self { | 28 | pub fn new() -> Self { |
28 | Names { | 29 | Names { |
29 | String::new(), String::new(), String::new(), String::new(), String::new() | 30 | english: String::new(), |
31 | aliases: String::new(), | ||
32 | romaji: String::new(), | ||
33 | furigana: String::new(), | ||
34 | japanese: String::new(), | ||
30 | } | 35 | } |
31 | } | 36 | } |
32 | } | 37 | } |
@@ -34,30 +39,40 @@ impl Names { | |||
34 | impl Series { | 39 | impl Series { |
35 | pub fn new() -> Self { | 40 | pub fn new() -> Self { |
36 | Series { | 41 | Series { |
37 | name: Names::new() | 42 | name: Names::new(), |
43 | tags: vec![], | ||
38 | } | 44 | } |
39 | } | 45 | } |
40 | 46 | ||
41 | pub fn parse(&mut self, buf: &str) { | 47 | pub fn parse(&mut self, buf: &str) { |
42 | let mut sections = get_sections(); | 48 | let mut sections = get_sections(); |
43 | section::process(&buf, &mut sections); | 49 | section::process(&buf, &mut sections); |
44 | 50 | ||
51 | let re_genre_tags = Regex::new(r#"(?is)Genre Tags.*?>(.*?)</td>"#).unwrap(); | ||
52 | let re_genre_tag = Regex::new(r#"[0-9]">(.*?)</A>"#).unwrap(); | ||
53 | |||
54 | for cap in re_genre_tags.captures_iter(&buf) { | ||
55 | self.tags = re_genre_tag.captures_iter(cap.at(1).unwrap()) | ||
56 | .map(|v| v.at(1).unwrap().to_string()) | ||
57 | .collect(); | ||
58 | } | ||
59 | |||
45 | { | 60 | { |
46 | let name: &Section = &sections["name".into()]; | 61 | let name: &Section = &sections["name".into()]; |
47 | 62 | ||
48 | self.name.english = name["english".into()]; | 63 | self.name.english = name.data["english".into()].to_string(); |
49 | self.name.aliases = name["aliases".into()]; | 64 | self.name.aliases = name.data["aliases".into()].to_string(); |
50 | self.name.romaji = name["romaji".into()]; | 65 | self.name.romaji = name.data["romaji".into()].to_string(); |
51 | self.name.furigana = name["furigana".into()]; | 66 | self.name.furigana = name.data["furigana".into()].to_string(); |
52 | self.name.japanese = name["japanese".into()]; | 67 | self.name.japanese = name.data["japanese".into()].to_string(); |
53 | } | 68 | } |
54 | } | 69 | } |
55 | } | 70 | } |
56 | 71 | ||
57 | fn get_sections() -> HashMap<String, Section> { | 72 | fn get_sections() -> HashMap<String, Section> { |
58 | let mut s: HashMap<String, Section> = HashMap::new(); | 73 | let mut s: HashMap<String, Section> = HashMap::new(); |
59 | 74 | ||
60 | s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?<TD>(.*?)</TD>.*?Aliases.*?<TD>(.*?)?</TD>.*?Romaji Title.*?<TD.*?>(.*?)</TD>.*?Furigana Title.*?<TD.*?>(.*?)</TD>.*?Japanese Title.*?<TD.*?>(.*?)</TD>"#, vec!["english", "aliases", "romaji", "furigana", "japanese"])); | 75 | s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?<TD>(.*?)<.*?Aliases.*?<TD>(.*?)<.*?Romaji Title.*?<TD.*?>(.*?)</.*?Furigana Title.*?<TD.*?>(.*?)</.*?Japanese Title.*?<TD.*?>(.*?)</"#, vec!["english", "aliases", "romaji", "furigana", "japanese"])); |
61 | 76 | ||
62 | s | 77 | s |
63 | } | 78 | } |
diff --git a/src/tags.rs b/src/tags.rs index 561c54d..63db847 100644 --- a/src/tags.rs +++ b/src/tags.rs | |||
@@ -11,5 +11,12 @@ pub struct Tag { | |||
11 | pub fn parse(s: &str) -> Vec<Tag> { | 11 | pub fn parse(s: &str) -> Vec<Tag> { |
12 | let reg_tag = Regex::new(r#"(?is)<a href="tags\.php\?id=([0-9]+)">(.*?)</a>"#).unwrap(); | 12 | let reg_tag = Regex::new(r#"(?is)<a href="tags\.php\?id=([0-9]+)">(.*?)</a>"#).unwrap(); |
13 | 13 | ||
14 | reg_tag.captures_iter(s).map(|c| Tag { id: u32::from_str(c.at(1).unwrap()).unwrap(), name: c.at(2).unwrap().into() }).collect() | 14 | reg_tag.captures_iter(s) |
15 | .map(|c| { | ||
16 | Tag { | ||
17 | id: u32::from_str(c.at(1).unwrap()).unwrap(), | ||
18 | name: c.at(2).unwrap().into(), | ||
19 | } | ||
20 | }) | ||
21 | .collect() | ||
15 | } | 22 | } |