aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjan <jan@ruken.pw>2016-10-09 09:28:08 (UTC)
committerjan <jan@ruken.pw>2016-10-09 09:28:08 (UTC)
commit6d7a31be9c6fd413d407334c40d02c008b7334ec (patch)
treeb9e73e5651d2dc1d72833875ab37469b7f2c1b53
parent54c2cb1e484e09b4fcfe236aff70c01bf3bbaae0 (diff)
fixierung
-rw-r--r--Cargo.lock24
-rw-r--r--Cargo.toml1
-rw-r--r--src/character.rs64
-rw-r--r--src/dl_list.rs9
-rw-r--r--src/main.rs20
-rw-r--r--src/pre_process.rs6
-rw-r--r--src/section.rs1
-rw-r--r--src/series.rs35
-rw-r--r--src/tags.rs9
9 files changed, 128 insertions, 41 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 3efbb7e..c1f35dc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -173,3 +173,27 @@ name = "winapi-build"
173version = "0.1.1" 173version = "0.1.1"
174source = "registry+https://github.com/rust-lang/crates.io-index" 174source = "registry+https://github.com/rust-lang/crates.io-index"
175 175
176[metadata]
177"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
178"checksum dtoa 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0dd841b58510c9618291ffa448da2e4e0f699d984d436122372f446dae62263d"
179"checksum itoa 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ae3088ea4baeceb0284ee9eea42f591226e6beaecf65373e41b38d95a1b8e7a1"
180"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
181"checksum libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)" = "408014cace30ee0f767b1c4517980646a573ec61a57957aeeabcac8ac0a02e8d"
182"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
183"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c"
184"checksum quote 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5a5071e94480b788e482dd13592c7221b75da33717fd0fd74aee76a01c40b35b"
185"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665"
186"checksum regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279401017ae31cf4e15344aa3f085d0e2e5c1e70067289ef906906fdbe92c8fd"
187"checksum serde 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "f6b2f52afebb4708a21b9d721dbac942daa010f8478c6486daa48d62d5c70508"
188"checksum serde_codegen 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "d9b64ecfe57712501e861b303982b549cfd56aed0ebf58823b36093d1807d69b"
189"checksum serde_codegen_internals 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "318f7e77aa5187391d74aaf4553d2189f56b0ce25e963414c951b97877ffdcec"
190"checksum serde_derive 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "2c7c2b01e85ca1330ba408325f6e85b8b4bf980320b0bd3bc366510e457c443f"
191"checksum serde_json 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e5b3bb42fa42265df8a1822b3db2090bc8f9e17e8142599c76a5b854bc4e7b5b"
192"checksum syn 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "96fed4e825d615b0ffd74dabb1dc4c5a078ab44e2c8004798f01510edf6cf515"
193"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
194"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
195"checksum unicode-xid 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "36dff09cafb4ec7c8cf0023eb0b686cb6ce65499116a12201c9e11840ca01beb"
196"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
197"checksum walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c66c0b9792f0a765345452775f3adbd28dde9d33f30d13e5dcc5ae17cf6f3780"
198"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
199"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
diff --git a/Cargo.toml b/Cargo.toml
index 34e4f6e..88c86b4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,6 @@ authors = ["jan <jan@ruken.pw>"]
6[dependencies] 6[dependencies]
7regex = "0.1" 7regex = "0.1"
8walkdir = "0.1" 8walkdir = "0.1"
9
10serde = "0.8" 9serde = "0.8"
11serde_derive = "0.8" 10serde_derive = "0.8"
12serde_json = "0.8" 11serde_json = "0.8"
diff --git a/src/character.rs b/src/character.rs
index 0630a45..f4686cf 100644
--- a/src/character.rs
+++ b/src/character.rs
@@ -42,19 +42,29 @@ pub struct Character {
42 42
43impl Names { 43impl Names {
44 pub fn new() -> Self { 44 pub fn new() -> Self {
45 Names { romaji: String::new(), japanese: String::new(), aliases: vec![] } 45 Names {
46 romaji: String::new(),
47 japanese: String::new(),
48 aliases: vec![],
49 }
46 } 50 }
47} 51}
48 52
49impl Images { 53impl Images {
50 pub fn new() -> Self { 54 pub fn new() -> Self {
51 Images { thumb: String::new(), full: String::new() } 55 Images {
56 thumb: String::new(),
57 full: String::new(),
58 }
52 } 59 }
53} 60}
54 61
55impl Traits { 62impl Traits {
56 pub fn new() -> Self { 63 pub fn new() -> Self {
57 Traits { official: vec![], indexed: vec![] } 64 Traits {
65 official: vec![],
66 indexed: vec![],
67 }
58 } 68 }
59} 69}
60 70
@@ -67,13 +77,14 @@ impl Character {
67 traits: Traits::new(), 77 traits: Traits::new(),
68 assignments: vec![], 78 assignments: vec![],
69 extra: vec![], 79 extra: vec![],
70 role: None 80 role: None,
71 } 81 }
72 } 82 }
73 83
74 pub fn parse(&mut self, buf: &str) { 84 pub fn parse(&mut self, buf: &str) {
75 let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#).unwrap(); 85 let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+</H3>.*?<dl>(.*?)</dl>"#)
76 let mut sections = get_sections(); 86 .unwrap();
87 let mut sections = get_sections();
77 section::process(&buf, &mut sections); 88 section::process(&buf, &mut sections);
78 89
79 let caps = re_extras.captures(&buf); 90 let caps = re_extras.captures(&buf);
@@ -89,40 +100,57 @@ impl Character {
89 100
90 self.name.romaji = name.data["romaji".into()].clone(); 101 self.name.romaji = name.data["romaji".into()].clone();
91 self.name.japanese = name.data["japanese".into()].clone(); 102 self.name.japanese = name.data["japanese".into()].clone();
92 103
93 if name.data["aliases".into()].len() > 0 { 104 if name.data["aliases".into()].len() > 0 {
94 self.name.aliases = name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(); 105 self.name.aliases = name.data["aliases".into()]
106 .split(", ")
107 .map(|s| s.to_string())
108 .collect();
95 } 109 }
96 110
97 self.image.thumb = image.data["thumb".into()].clone(); 111 self.image.thumb = image.data["thumb".into()].clone();
98 self.image.full = image.data["full".into()].clone(); 112 self.image.full = image.data["full".into()].clone();
99 113
100 if tags.data.contains_key("tags_raw".into()) { 114 if tags.data.contains_key("tags_raw".into()) {
101 self.tags = tags::parse(&tags.data["tags_raw".into()]); 115 self.tags = tags::parse(&tags.data["tags_raw".into()]);
102 } 116 }
103 117
104 self.traits.official = dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]); 118 self.traits.official =
105 self.traits.indexed = dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]); 119 dl_list::parse(&(&sections["traits"] as &Section).data["official_raw".into()]);
120 self.traits.indexed =
121 dl_list::parse(&(&sections["traits"] as &Section).data["indexed_raw".into()]);
106 122
107 self.assignments = tiles::parse_tile_link_ids(&(&sections["assignments"] as &Section).data["raw".into()], "series"); 123 self.assignments = tiles::parse_tile_link_ids(&(&sections["assignments"] as &Section).data["raw".into()], "series");
108 124
109 if misc.data["role".into()].len() > 0 { 125 if misc.data["role".into()].len() > 0 {
110 self.role = Some(misc.data["role".into()].clone()); 126 self.role = Some(misc.data["role".into()].clone());
111 } 127 }
112 } 128 }
113 } 129 }
114} 130}
115 131
116fn get_sections() -> HashMap<String, Section> { 132fn get_sections() -> HashMap<String, Section> {
117 let mut s: HashMap<String, Section> = HashMap::new(); 133 let mut s: HashMap<String, Section> = HashMap::new();
118 134
119 s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"])); 135 s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?<TD>(.*?)\s?</TD>.*?Japanese Name.*?<TD>(.*?)\s?</TD>.*?Aliases.*?<TD>(.*?)\s?</TD>"#, vec!["romaji", "japanese", "aliases"]));
120 s.insert("misc".into(), Section::new("misc", r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#, vec!["role"])); 136 s.insert("misc".into(),
137 Section::new("misc",
138 r#"(?is)Role</TH>.*?<TD>(.*?)\s?</TD>"#,
139 vec!["role"]));
121 s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"])); 140 s.insert("image".into(), Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb", "full"]));
122 s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#, vec!["tags_raw"])); 141 s.insert("tags".into(),
142 Section::new("tags",
143 r#"(?is)tagged as</P>.*?<TH>(.*?)</TH>"#,
144 vec!["tags_raw"]));
123 s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"])); 145 s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits</P>.*?<dl>(.*?)</dl>.*?official traits\s?</P>.*?<dl>(.*?)</dl>"#, vec!["indexed_raw", "official_raw"]));
124 s.insert("assignments".into(), Section::new("assignments", r#"(?is)appears in the following</P>(.*?)</UL>"#, vec!["raw"])); 146 s.insert("assignments".into(),
125 s.insert("chars_similar_traits".into(), Section::new("assignments", r#"(?is)with Similar Traits</H3>(.*?)</UL>"#, vec!["raw"])); 147 Section::new("assignments",
148 r#"(?is)appears in the following</P>(.*?)</UL>"#,
149 vec!["raw"]));
150 s.insert("chars_similar_traits".into(),
151 Section::new("assignments",
152 r#"(?is)with Similar Traits</H3>(.*?)</UL>"#,
153 vec!["raw"]));
126 154
127 s 155 s
128} 156}
diff --git a/src/dl_list.rs b/src/dl_list.rs
index 3327388..17d49fc 100644
--- a/src/dl_list.rs
+++ b/src/dl_list.rs
@@ -9,5 +9,12 @@ pub struct DLListItem {
9pub fn parse(s: &str) -> Vec<DLListItem> { 9pub fn parse(s: &str) -> Vec<DLListItem> {
10 let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap(); 10 let reg_list_item = Regex::new(r#"(?is)<dt.*?>(.*?)</dt>.*?<dd>(.*?)</dd>"#).unwrap();
11 11
12 reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() 12 reg_list_item.captures_iter(s)
13 .map(|c| {
14 DLListItem {
15 name: c.at(1).unwrap().into(),
16 value: c.at(2).unwrap().into(),
17 }
18 })
19 .collect()
13} 20}
diff --git a/src/main.rs b/src/main.rs
index 7481376..f15a3dd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,9 @@
1#![feature(proc_macro)] 1#![feature(proc_macro)]
2#[macro_use] extern crate serde_derive; 2#[macro_use]
3extern crate serde_derive;
3extern crate serde_json; 4extern crate serde_json;
4 5
5extern crate regex; 6extern crate regex;
6extern crate yaml_rust;
7 7
8extern crate walkdir; 8extern crate walkdir;
9use walkdir::{WalkDir, DirEntry}; 9use walkdir::{WalkDir, DirEntry};
@@ -38,8 +38,12 @@ fn main() {
38 38
39 let active_threads = Arc::new(AtomicUsize::new(0)); 39 let active_threads = Arc::new(AtomicUsize::new(0));
40 40
41 41
42 let files: Arc<Mutex<Vec<DirEntry>>> = Arc::new(Mutex::new(WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()).collect())); 42 let files: Arc<Mutex<Vec<DirEntry>>> = Arc::new(Mutex::new(WalkDir::new(base_path)
43 .min_depth(1)
44 .into_iter()
45 .filter_map(|e| e.ok())
46 .collect()));
43 47
44 for i in 0..MAX_THREADS { 48 for i in 0..MAX_THREADS {
45 let files = files.clone(); 49 let files = files.clone();
@@ -58,7 +62,7 @@ fn main() {
58 } 62 }
59 } 63 }
60 let entry = entry.unwrap(); 64 let entry = entry.unwrap();
61 65
62 let mut f = File::open(entry.path()).expect("could not open file"); 66 let mut f = File::open(entry.path()).expect("could not open file");
63 let mut buf = String::new(); 67 let mut buf = String::new();
64 if let Err(_) = f.read_to_string(&mut buf) { 68 if let Err(_) = f.read_to_string(&mut buf) {
@@ -68,12 +72,16 @@ fn main() {
68 72
69 let buf = pre_process::strip_irrelevant_content(&buf); 73 let buf = pre_process::strip_irrelevant_content(&buf);
70 74
75 println!("{:?}", entry.path());
71 let mut char = Series::new(); 76 let mut char = Series::new();
72 char.parse(&buf); 77 char.parse(&buf);
73 78
74 let json = serde_json::to_string(&char).unwrap(); 79 let json = serde_json::to_string(&char).unwrap();
75 80
76 let out_file = out_path.join(entry.file_name().to_str().unwrap().replace("html", "json")); 81 let out_file = out_path.join(entry.file_name()
82 .to_str()
83 .unwrap()
84 .replace("html", "json"));
77 let mut o = File::create(&out_file).unwrap(); 85 let mut o = File::create(&out_file).unwrap();
78 o.write_all(json.as_bytes()).unwrap(); 86 o.write_all(json.as_bytes()).unwrap();
79 println!("{:?}", out_file); 87 println!("{:?}", out_file);
diff --git a/src/pre_process.rs b/src/pre_process.rs
index 877ddc3..c68528f 100644
--- a/src/pre_process.rs
+++ b/src/pre_process.rs
@@ -1,15 +1,13 @@
1pub fn strip_irrelevant_content(s: &str) -> String { 1pub fn strip_irrelevant_content(s: &str) -> String {
2 let mut retn = ""; 2 let mut retn = "";
3 match s.find(r#"<div class=profile id=profile>"#) { 3 match s.find(r#"<A href="watchdontwatch.php">"#) {
4 Some(pos) => retn = &s[pos..], 4 Some(pos) => retn = &s[pos..],
5 None => (), 5 None => (),
6 }; 6 };
7 7
8 match s.find(r#"<INPUT style="font-size: 2em;" TYPE=SUBMIT NAME="votes" VALUE="Cast Votes">"#) { 8 match s.find(r#"<H3>Characters"#) {
9 Some(pos) => retn = &s[..pos], 9 Some(pos) => retn = &s[..pos],
10 None => (), 10 None => (),
11 }; 11 };
12 return retn.into(); 12 return retn.into();
13} 13}
14
15
diff --git a/src/section.rs b/src/section.rs
index 7e492b1..48f237d 100644
--- a/src/section.rs
+++ b/src/section.rs
@@ -1,6 +1,7 @@
1use super::regex::Regex; 1use super::regex::Regex;
2use std::collections::HashMap; 2use std::collections::HashMap;
3 3
4#[derive(Debug)]
4pub struct Section { 5pub struct Section {
5 pub name: String, 6 pub name: String,
6 pub re: Regex, 7 pub re: Regex,
diff --git a/src/series.rs b/src/series.rs
index d111fb8..3037e6a 100644
--- a/src/series.rs
+++ b/src/series.rs
@@ -21,12 +21,17 @@ pub struct Names {
21#[derive(Debug, Serialize)] 21#[derive(Debug, Serialize)]
22pub struct Series { 22pub struct Series {
23 pub name: Names, 23 pub name: Names,
24 pub tags: Vec<String>,
24} 25}
25 26
26impl Names { 27impl Names {
27 pub fn new() -> Self { 28 pub fn new() -> Self {
28 Names { 29 Names {
29 String::new(), String::new(), String::new(), String::new(), String::new() 30 english: String::new(),
31 aliases: String::new(),
32 romaji: String::new(),
33 furigana: String::new(),
34 japanese: String::new(),
30 } 35 }
31 } 36 }
32} 37}
@@ -34,30 +39,40 @@ impl Names {
34impl Series { 39impl Series {
35 pub fn new() -> Self { 40 pub fn new() -> Self {
36 Series { 41 Series {
37 name: Names::new() 42 name: Names::new(),
43 tags: vec![],
38 } 44 }
39 } 45 }
40 46
41 pub fn parse(&mut self, buf: &str) { 47 pub fn parse(&mut self, buf: &str) {
42 let mut sections = get_sections(); 48 let mut sections = get_sections();
43 section::process(&buf, &mut sections); 49 section::process(&buf, &mut sections);
44 50
51 let re_genre_tags = Regex::new(r#"(?is)Genre Tags.*?>(.*?)</td>"#).unwrap();
52 let re_genre_tag = Regex::new(r#"[0-9]">(.*?)</A>"#).unwrap();
53
54 for cap in re_genre_tags.captures_iter(&buf) {
55 self.tags = re_genre_tag.captures_iter(cap.at(1).unwrap())
56 .map(|v| v.at(1).unwrap().to_string())
57 .collect();
58 }
59
45 { 60 {
46 let name: &Section = &sections["name".into()]; 61 let name: &Section = &sections["name".into()];
47 62
48 self.name.english = name["english".into()]; 63 self.name.english = name.data["english".into()].to_string();
49 self.name.aliases = name["aliases".into()]; 64 self.name.aliases = name.data["aliases".into()].to_string();
50 self.name.romaji = name["romaji".into()]; 65 self.name.romaji = name.data["romaji".into()].to_string();
51 self.name.furigana = name["furigana".into()]; 66 self.name.furigana = name.data["furigana".into()].to_string();
52 self.name.japanese = name["japanese".into()]; 67 self.name.japanese = name.data["japanese".into()].to_string();
53 } 68 }
54 } 69 }
55} 70}
56 71
57fn get_sections() -> HashMap<String, Section> { 72fn get_sections() -> HashMap<String, Section> {
58 let mut s: HashMap<String, Section> = HashMap::new(); 73 let mut s: HashMap<String, Section> = HashMap::new();
59 74
60 s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?<TD>(.*?)</TD>.*?Aliases.*?<TD>(.*?)?</TD>.*?Romaji Title.*?<TD.*?>(.*?)</TD>.*?Furigana Title.*?<TD.*?>(.*?)</TD>.*?Japanese Title.*?<TD.*?>(.*?)</TD>"#, vec!["english", "aliases", "romaji", "furigana", "japanese"])); 75 s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?<TD>(.*?)<.*?Aliases.*?<TD>(.*?)<.*?Romaji Title.*?<TD.*?>(.*?)</.*?Furigana Title.*?<TD.*?>(.*?)</.*?Japanese Title.*?<TD.*?>(.*?)</"#, vec!["english", "aliases", "romaji", "furigana", "japanese"]));
61 76
62 s 77 s
63} 78}
diff --git a/src/tags.rs b/src/tags.rs
index 561c54d..63db847 100644
--- a/src/tags.rs
+++ b/src/tags.rs
@@ -11,5 +11,12 @@ pub struct Tag {
11pub fn parse(s: &str) -> Vec<Tag> { 11pub fn parse(s: &str) -> Vec<Tag> {
12 let reg_tag = Regex::new(r#"(?is)<a href="tags\.php\?id=([0-9]+)">(.*?)</a>"#).unwrap(); 12 let reg_tag = Regex::new(r#"(?is)<a href="tags\.php\?id=([0-9]+)">(.*?)</a>"#).unwrap();
13 13
14 reg_tag.captures_iter(s).map(|c| Tag { id: u32::from_str(c.at(1).unwrap()).unwrap(), name: c.at(2).unwrap().into() }).collect() 14 reg_tag.captures_iter(s)
15 .map(|c| {
16 Tag {
17 id: u32::from_str(c.at(1).unwrap()).unwrap(),
18 name: c.at(2).unwrap().into(),
19 }
20 })
21 .collect()
15} 22}