aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/character.rs64
-rw-r--r--src/dl_list.rs9
-rw-r--r--src/main.rs20
-rw-r--r--src/pre_process.rs6
-rw-r--r--src/section.rs1
-rw-r--r--src/series.rs35
-rw-r--r--src/tags.rs9
7 files changed, 104 insertions, 40 deletions
diff --git a/src/character.rs b/src/character.rs
index 0630a45..f4686cf 100644
--- a/src/character.rs
+++ b/src/character.rs
@@ -42,19 +42,29 @@ pub struct Character {
42 42
43impl Names { 43impl Names {
44 pub fn new() -> Self { 44 pub fn new() -> Self {
45 Names { romaji: String::new(), japanese: String::new(), aliases: vec![] } 45 Names {
46 romaji: String::new(),
47 japanese: String::new(),
48 aliases: vec![],
49 }
46 } 50 }
47} 51}
48 52
49impl Images { 53impl Images {
50 pub fn new() -> Self { 54 pub fn new() -> Self {
51 Images { thumb: String::new(), full: String::new() } 55 Images {
56 thumb: String::new(),
57 full: String::new(),
58 }
52 } 59 }
53} 60}
54 61
55impl Traits { 62impl Traits {
56 pub fn new() -> Self { 63 pub fn new() -> Self {
57 Traits { official: vec![], indexed: vec![] } 64 Traits {
65 official: vec![],
66 indexed: vec![],
67 }
58 } 68 }
59} 69}
60 70
@@ -67,13 +77,14 @@ impl Character {
67 traits: Traits::new(), 77 traits: Traits::new(),
68 assignments: vec![], 78 assignments: vec![],
69 extra: vec![], 79 extra: vec![],
70 role: None 80 role: None,
71 } 81 }
72 } 82 }
73 83
74 pub fn parse(&mut self, buf: &str) { 84 pub fn parse(&mut self, buf: &str) {
75 let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#).unwrap(); 85 let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#)
76 let mut sections = get_sections(); 86 .unwrap();
87 let mut sections = get_sections();
77 section::process(&buf, &mut sections); 88 section::process(&buf, &mut sections);
78 89
79 let caps = re_extras.captures(&buf); 90 let caps = re_extras.captures(&buf);
@@ -89,40 +100,57 @@ impl Character {
89 100
90 self.name.romaji = name.data["romaji".into()].clone(); 101 self.name.romaji = name.data["romaji".into()].clone();
91 self.name.japanese = name.data["japanese".into()].clone(); 102 self.name.japanese = name.data["japanese".into()].clone();
92 103
93 if name.data["aliases".into()].len() > 0 { 104 if name.data["aliases".into()].len() > 0 {
94 self.name.aliases = name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(); 105 self.name.aliases = name.data["aliases".into()]
106 .split(", ")
107 .map(|s| s.to_string())
108 .collect();
95 } 109 }
96 110
97 self.image.thumb = image.data["thumb".into()].clone(); 111 self.image.thumb = image.data["thumb".into()].clone();
98 self.image.full = image.data["full".into()].clone(); 112 self.image.full = image.data["full".into()].clone();
99 113
100 if tags.data.contains_key("tags_raw".into()) { 114 if tags.data.contains_key("tags_raw".into()) {
101 self.tags = tags::parse(&tags.data["tags_raw".into()]); 115 self.tags = tags::parse(&tags.data["tags_raw".into()]);
102 } 116 }
103 117
104 self.traits.official = dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]); 118 self.traits.official =
105 self.traits.indexed = dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]); 119 dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]);
120 self.traits.indexed =
121 dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]);
106 122
107 self.assignments = tiles::parse_tile_link_ids(&(&sections["assignments"] as &Section).data["raw".into()], "series"); 123 self.assignments = tiles::parse_tile_link_ids(&(&sections["assignments"] as &Section).data["raw".into()], "series");
108 124
109 if misc.data["role".into()].len() > 0 { 125 if misc.data["role".into()].len() > 0 {
110 self.role = Some(misc.data["role".into()].clone()); 126 self.role = Some(misc.data["role".into()].clone());
111 } 127 }
112 } 128 }
113 } 129 }
114} 130}
115 131
116fn get_sections() -> HashMap<String, Section> { 132fn get_sections() -> HashMap<String, Section> {
117 let mut s: HashMap<String, Section> = HashMap::new(); 133 let mut s: HashMap<String, Section> = HashMap::new();
118 134
119 s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"])); 135 s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"]));
120 s.insert("misc".into(), Section::new("misc", r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, vec!["role"])); 136 s.insert("misc".into(),
137 Section::new("misc",
138 r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#,
139 vec!["role"]));
121 s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"])); 140 s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"]));
122 s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, vec!["tags_raw"])); 141 s.insert("tags".into(),
142 Section::new("tags",
143 r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#,
144 vec!["tags_raw"]));
123 s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"])); 145 s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"]));
124 s.insert("assignments".into(), Section::new("assignments", r#"(?is)appears in the following</P>(.*?)</UL>"#, vec!["raw"])); 146 s.insert("assignments".into(),
125 s.insert("chars_similar_traits".into(), Section::new("assignments", r#"(?is)with Similar Traits</H3>(.*?)</UL>"#, vec!["raw"])); 147 Section::new("assignments",
148 r#"(?is)appears in the following</P>(.*?)</UL>"#,
149 vec!["raw"]));
150 s.insert("chars_similar_traits".into(),
151 Section::new("assignments",
152 r#"(?is)with Similar Traits</H3>(.*?)</UL>"#,
153 vec!["raw"]));
126 154
127 s 155 s
128} 156}
diff --git a/src/dl_list.rs b/src/dl_list.rs
index 3327388..17d49fc 100644
--- a/src/dl_list.rs
+++ b/src/dl_list.rs
@@ -9,5 +9,12 @@ pub struct DLListItem {
9pub fn parse(s: &str) -> Vec<DLListItem> { 9pub fn parse(s: &str) -> Vec<DLListItem> {
10 let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); 10 let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap();
11 11
12 reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() 12 reg_list_item.captures_iter(s)
13 .map(|c| {
14 DLListItem {
15 name: c.at(1).unwrap().into(),
16 value: c.at(2).unwrap().into(),
17 }
18 })
19 .collect()
13} 20}
diff --git a/src/main.rs b/src/main.rs
index 7481376..f15a3dd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,9 @@
1#![feature(proc_macro)] 1#![feature(proc_macro)]
2#[macro_use] extern crate serde_derive; 2#[macro_use]
3extern crate serde_derive;
3extern crate serde_json; 4extern crate serde_json;
4 5
5extern crate regex; 6extern crate regex;
6extern crate yaml_rust;
7 7
8extern crate walkdir; 8extern crate walkdir;
9use walkdir::{WalkDir, DirEntry}; 9use walkdir::{WalkDir, DirEntry};
@@ -38,8 +38,12 @@ fn main() {
38 38
39 let active_threads = Arc::new(AtomicUsize::new(0)); 39 let active_threads = Arc::new(AtomicUsize::new(0));
40 40
41 41
42 let files: Arc<Mutex<Vec<DirEntry>>> = Arc::new(Mutex::new(WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()).collect())); 42 let files: Arc<Mutex<Vec<DirEntry>>> = Arc::new(Mutex::new(WalkDir::new(base_path)
43 .min_depth(1)
44 .into_iter()
45 .filter_map(|e| e.ok())
46 .collect()));
43 47
44 for i in 0..MAX_THREADS { 48 for i in 0..MAX_THREADS {
45 let files = files.clone(); 49 let files = files.clone();
@@ -58,7 +62,7 @@ fn main() {
58 } 62 }
59 } 63 }
60 let entry = entry.unwrap(); 64 let entry = entry.unwrap();
61 65
62 let mut f = File::open(entry.path()).expect("could not open file"); 66 let mut f = File::open(entry.path()).expect("could not open file");
63 let mut buf = String::new(); 67 let mut buf = String::new();
64 if let Err(_) = f.read_to_string(&mut buf) { 68 if let Err(_) = f.read_to_string(&mut buf) {
@@ -68,12 +72,16 @@ fn main() {
68 72
69 let buf = pre_process::strip_irrelevant_content(&buf); 73 let buf = pre_process::strip_irrelevant_content(&buf);
70 74
75 println!("{:?}", entry.path());
71 let mut char = Series::new(); 76 let mut char = Series::new();
72 char.parse(&buf); 77 char.parse(&buf);
73 78
74 let json = serde_json::to_string(&char).unwrap(); 79 let json = serde_json::to_string(&char).unwrap();
75 80
76 let out_file = out_path.join(entry.file_name().to_str().unwrap().replace("html", "json")); 81 let out_file = out_path.join(entry.file_name()
82 .to_str()
83 .unwrap()
84 .replace("html", "json"));
77 let mut o = File::create(&out_file).unwrap(); 85 let mut o = File::create(&out_file).unwrap();
78 o.write_all(json.as_bytes()).unwrap(); 86 o.write_all(json.as_bytes()).unwrap();
79 println!("{:?}", out_file); 87 println!("{:?}", out_file);
diff --git a/src/pre_process.rs b/src/pre_process.rs
index 877ddc3..c68528f 100644
--- a/src/pre_process.rs
+++ b/src/pre_process.rs
@@ -1,15 +1,13 @@
1pub fn strip_irrelevant_content(s: &str) -> String { 1pub fn strip_irrelevant_content(s: &str) -> String {
2 let mut retn = ""; 2 let mut retn = "";
3 match s.find(r#"<div class=profile id=profile>"#) { 3 match s.find(r#"<A href="watchdontwatch.php">"#) {
4 Some(pos) => retn = &s[pos..], 4 Some(pos) => retn = &s[pos..],
5 None => (), 5 None => (),
6 }; 6 };
7 7
8 match s.find(r#"<INPUT style="font-size: 2em;" TYPE=SUBMIT NAME="votes" VALUE="Cast Votes">"#) { 8 match s.find(r#"<H3>Characters"#) {
9 Some(pos) => retn = &s[..pos], 9 Some(pos) => retn = &s[..pos],
10 None => (), 10 None => (),
11 }; 11 };
12 return retn.into(); 12 return retn.into();
13} 13}
14
15
diff --git a/src/section.rs b/src/section.rs
index 7e492b1..48f237d 100644
--- a/src/section.rs
+++ b/src/section.rs
@@ -1,6 +1,7 @@
1use super::regex::Regex; 1use super::regex::Regex;
2use std::collections::HashMap; 2use std::collections::HashMap;
3 3
4#[derive(Debug)]
4pub struct Section { 5pub struct Section {
5 pub name: String, 6 pub name: String,
6 pub re: Regex, 7 pub re: Regex,
diff --git a/src/series.rs b/src/series.rs
index d111fb8..3037e6a 100644
--- a/src/series.rs
+++ b/src/series.rs
@@ -21,12 +21,17 @@ pub struct Names {
21#[derive(Debug, Serialize)] 21#[derive(Debug, Serialize)]
22pub struct Series { 22pub struct Series {
23 pub name: Names, 23 pub name: Names,
24 pub tags: Vec<String>,
24} 25}
25 26
26impl Names { 27impl Names {
27 pub fn new() -> Self { 28 pub fn new() -> Self {
28 Names { 29 Names {
29 String::new(), String::new(), String::new(), String::new(), String::new() 30 english: String::new(),
31 aliases: String::new(),
32 romaji: String::new(),
33 furigana: String::new(),
34 japanese: String::new(),
30 } 35 }
31 } 36 }
32} 37}
@@ -34,30 +39,40 @@ impl Names {
34impl Series { 39impl Series {
35 pub fn new() -> Self { 40 pub fn new() -> Self {
36 Series { 41 Series {
37 name: Names::new() 42 name: Names::new(),
43 tags: vec![],
38 } 44 }
39 } 45 }
40 46
41 pub fn parse(&mut self, buf: &str) { 47 pub fn parse(&mut self, buf: &str) {
42 let mut sections = get_sections(); 48 let mut sections = get_sections();
43 section::process(&buf, &mut sections); 49 section::process(&buf, &mut sections);
44 50
51 let re_genre_tags = Regex::new(r#"(?is)Genre Tags.*?>(.*?)</td>"#).unwrap();
52 let re_genre_tag = Regex::new(r#"[0-9]">(.*?)</A>"#).unwrap();
53
54 for cap in re_genre_tags.captures_iter(&buf) {
55 self.tags = re_genre_tag.captures_iter(cap.at(1).unwrap())
56 .map(|v| v.at(1).unwrap().to_string())
57 .collect();
58 }
59
45 { 60 {
46 let name: &Section = &sections["name".into()]; 61 let name: &Section = &sections["name".into()];
47 62
48 self.name.english = name["english".into()]; 63 self.name.english = name.data["english".into()].to_string();
49 self.name.aliases = name["aliases".into()]; 64 self.name.aliases = name.data["aliases".into()].to_string();
50 self.name.romaji = name["romaji".into()]; 65 self.name.romaji = name.data["romaji".into()].to_string();
51 self.name.furigana = name["furigana".into()]; 66 self.name.furigana = name.data["furigana".into()].to_string();
52 self.name.japanese = name["japanese".into()]; 67 self.name.japanese = name.data["japanese".into()].to_string();
53 } 68 }
54 } 69 }
55} 70}
56 71
57fn get_sections() -> HashMap<String, Section> { 72fn get_sections() -> HashMap<String, Section> {
58 let mut s: HashMap<String, Section> = HashMap::new(); 73 let mut s: HashMap<String, Section> = HashMap::new();
59 74
60 s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?<TD>(.*?)</TD>.*?Aliases.*?<TD>(.*?)?</TD>.*?Romaji Title.*?<TD.*?>(.*?)</TD>.*?Furigana Title.*?<TD.*?>(.*?)</TD>.*?Japanese Title.*?<TD.*?>(.*?)</TD>"#, vec!["english", "aliases", "romaji", "furigana", "japanese"])); 75 s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?<TD>(.*?)<.*?Aliases.*?<TD>(.*?)<.*?Romaji Title.*?<TD.*?>(.*?)</.*?Furigana Title.*?<TD.*?>(.*?)</.*?Japanese Title.*?<TD.*?>(.*?)</"#, vec!["english", "aliases", "romaji", "furigana", "japanese"]));
61 76
62 s 77 s
63} 78}
diff --git a/src/tags.rs b/src/tags.rs
index 561c54d..63db847 100644
--- a/src/tags.rs
+++ b/src/tags.rs
@@ -11,5 +11,12 @@ pub struct Tag {
11pub fn parse(s: &str) -> Vec<Tag> { 11pub fn parse(s: &str) -> Vec<Tag> {
12 let reg_tag = Regex::new(r#"(?is)<a href="tags\.php\?id=([0-9]+)">(.*?)</a>"#).unwrap(); 12 let reg_tag = Regex::new(r#"(?is)<a href="tags\.php\?id=([0-9]+)">(.*?)</a>"#).unwrap();
13 13
14 reg_tag.captures_iter(s).map(|c| Tag { id: u32::from_str(c.at(1).unwrap()).unwrap(), name: c.at(2).unwrap().into() }).collect() 14 reg_tag.captures_iter(s)
15 .map(|c| {
16 Tag {
17 id: u32::from_str(c.at(1).unwrap()).unwrap(),
18 name: c.at(2).unwrap().into(),
19 }
20 })
21 .collect()
15} 22}