From 6d7a31be9c6fd413d407334c40d02c008b7334ec Mon Sep 17 00:00:00 2001 From: jan Date: Sun, 9 Oct 2016 11:28:08 +0200 Subject: fixierung diff --git a/Cargo.lock b/Cargo.lock index 3efbb7e..c1f35dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -173,3 +173,27 @@ name = "winapi-build" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[metadata] +"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum dtoa 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0dd841b58510c9618291ffa448da2e4e0f699d984d436122372f446dae62263d" +"checksum itoa 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ae3088ea4baeceb0284ee9eea42f591226e6beaecf65373e41b38d95a1b8e7a1" +"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +"checksum libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)" = "408014cace30ee0f767b1c4517980646a573ec61a57957aeeabcac8ac0a02e8d" +"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c" +"checksum quote 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5a5071e94480b788e482dd13592c7221b75da33717fd0fd74aee76a01c40b35b" +"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" +"checksum regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279401017ae31cf4e15344aa3f085d0e2e5c1e70067289ef906906fdbe92c8fd" +"checksum serde 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "f6b2f52afebb4708a21b9d721dbac942daa010f8478c6486daa48d62d5c70508" +"checksum serde_codegen 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "d9b64ecfe57712501e861b303982b549cfd56aed0ebf58823b36093d1807d69b" +"checksum serde_codegen_internals 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "318f7e77aa5187391d74aaf4553d2189f56b0ce25e963414c951b97877ffdcec" +"checksum serde_derive 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "2c7c2b01e85ca1330ba408325f6e85b8b4bf980320b0bd3bc366510e457c443f" +"checksum serde_json 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e5b3bb42fa42265df8a1822b3db2090bc8f9e17e8142599c76a5b854bc4e7b5b" +"checksum syn 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "96fed4e825d615b0ffd74dabb1dc4c5a078ab44e2c8004798f01510edf6cf515" +"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" +"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum unicode-xid 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "36dff09cafb4ec7c8cf0023eb0b686cb6ce65499116a12201c9e11840ca01beb" +"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +"checksum walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c66c0b9792f0a765345452775f3adbd28dde9d33f30d13e5dcc5ae17cf6f3780" +"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" +"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/Cargo.toml b/Cargo.toml index 34e4f6e..88c86b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,6 @@ authors = ["jan "] [dependencies] regex = "0.1" walkdir = "0.1" - serde = "0.8" serde_derive = "0.8" serde_json = "0.8" diff --git a/src/character.rs b/src/character.rs index 0630a45..f4686cf 100644 --- a/src/character.rs +++ b/src/character.rs @@ -42,19 +42,29 @@ pub struct Character { impl Names { pub fn new() -> Self { - Names { romaji: String::new(), japanese: String::new(), aliases: vec![] } + Names { + romaji: String::new(), + japanese: String::new(), + aliases: vec![], + } } } impl Images { pub fn new() -> Self { - Images { thumb: String::new(), full: String::new() } + Images { + thumb: String::new(), + full: String::new(), + } } } impl Traits { pub fn new() -> Self { - Traits { official: vec![], indexed: vec![] } + Traits { + official: vec![], + indexed: vec![], + } } } @@ -67,13 +77,14 @@ impl Character { traits: Traits::new(), assignments: vec![], extra: vec![], - role: None + role: None, } } pub fn parse(&mut self, buf: &str) { - let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+.*?
(.*?)
"#).unwrap(); - let mut sections = get_sections(); + let re_extras = Regex::new(r#"(?is)Extra Details \| [0-9]+.*?
(.*?)
"#) + .unwrap(); + let mut sections = get_sections(); section::process(&buf, &mut sections); let caps = re_extras.captures(&buf); @@ -89,40 +100,57 @@ impl Character { self.name.romaji = name.data["romaji".into()].clone(); self.name.japanese = name.data["japanese".into()].clone(); - + if name.data["aliases".into()].len() > 0 { - self.name.aliases = name.data["aliases".into()].split(", ").map(|s| s.to_string()).collect(); + self.name.aliases = name.data["aliases".into()] + .split(", ") + .map(|s| s.to_string()) + .collect(); } - + self.image.thumb = image.data["thumb".into()].clone(); self.image.full = image.data["full".into()].clone(); - + if tags.data.contains_key("tags_raw".into()) { self.tags = tags::parse(&tags.data["tags_raw".into()]); } - self.traits.official = dl_list::parse(&(§ions["traits"] as &Section).data["official_raw".into()]); - self.traits.indexed = dl_list::parse(&(§ions["traits"] as &Section).data["indexed_raw".into()]); + self.traits.official = + dl_list::parse(&(§ions["traits"] as &Section).data["official_raw".into()]); + self.traits.indexed = + dl_list::parse(&(§ions["traits"] as &Section).data["indexed_raw".into()]); self.assignments = tiles::parse_tile_link_ids(&(§ions["assignments"] as &Section).data["raw".into()], "series"); - + if misc.data["role".into()].len() > 0 { self.role = Some(misc.data["role".into()].clone()); } } - } + } } fn get_sections() -> HashMap { let mut s: HashMap = HashMap::new(); s.insert("name".into(), Section::new("name", r#"(?is)Romaji Name.*?(.*?)\s?.*?Japanese Name.*?(.*?)\s?.*?Aliases.*?(.*?)\s?"#, vec!["romaji", "japanese", "aliases"])); - s.insert("misc".into(), Section::new("misc", r#"(?is)Role.*?(.*?)\s?"#, vec!["role"])); + s.insert("misc".into(), + Section::new("misc", + r#"(?is)Role.*?(.*?)\s?"#, + vec!["role"])); s.insert("image".into(), Section::new("image", r#"(?is)

.*.*?

View Full Size Image"#, vec!["thumb", "full"])); - s.insert("tags".into(), Section::new("tags", r#"(?is)tagged as

.*?(.*?)"#, vec!["tags_raw"])); + s.insert("tags".into(), + Section::new("tags", + r#"(?is)tagged as

.*?(.*?)"#, + vec!["tags_raw"])); s.insert("traits".into(), Section::new("traits", r#"(?is)indexed traits

.*?
(.*?)
.*?official traits\s?

.*?
(.*?)
"#, vec!["indexed_raw", "official_raw"])); - s.insert("assignments".into(), Section::new("assignments", r#"(?is)appears in the following

(.*?)"#, vec!["raw"])); - s.insert("chars_similar_traits".into(), Section::new("assignments", r#"(?is)with Similar Traits

(.*?)"#, vec!["raw"])); + s.insert("assignments".into(), + Section::new("assignments", + r#"(?is)appears in the following

(.*?)"#, + vec!["raw"])); + s.insert("chars_similar_traits".into(), + Section::new("assignments", + r#"(?is)with Similar Traits(.*?)"#, + vec!["raw"])); s } diff --git a/src/dl_list.rs b/src/dl_list.rs index 3327388..17d49fc 100644 --- a/src/dl_list.rs +++ b/src/dl_list.rs @@ -9,5 +9,12 @@ pub struct DLListItem { pub fn parse(s: &str) -> Vec { let reg_list_item = Regex::new(r#"(?is)(.*?).*?
(.*?)
"#).unwrap(); - reg_list_item.captures_iter(s).map(|c| DLListItem { name: c.at(1).unwrap().into(), value: c.at(2).unwrap().into() }).collect() + reg_list_item.captures_iter(s) + .map(|c| { + DLListItem { + name: c.at(1).unwrap().into(), + value: c.at(2).unwrap().into(), + } + }) + .collect() } diff --git a/src/main.rs b/src/main.rs index 7481376..f15a3dd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,9 @@ #![feature(proc_macro)] -#[macro_use] extern crate serde_derive; +#[macro_use] +extern crate serde_derive; extern crate serde_json; extern crate regex; -extern crate yaml_rust; extern crate walkdir; use walkdir::{WalkDir, DirEntry}; @@ -38,8 +38,12 @@ fn main() { let active_threads = Arc::new(AtomicUsize::new(0)); - - let files: Arc>> = Arc::new(Mutex::new(WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()).collect())); + + let files: Arc>> = Arc::new(Mutex::new(WalkDir::new(base_path) + .min_depth(1) + .into_iter() + .filter_map(|e| e.ok()) + .collect())); for i in 0..MAX_THREADS { let files = files.clone(); @@ -58,7 +62,7 @@ fn main() { } } let entry = entry.unwrap(); - + let mut f = File::open(entry.path()).expect("could not open file"); let mut buf = String::new(); if let Err(_) = f.read_to_string(&mut buf) { @@ -68,12 +72,16 @@ fn main() { let buf = pre_process::strip_irrelevant_content(&buf); + println!("{:?}", entry.path()); let mut char = Series::new(); char.parse(&buf); let json = serde_json::to_string(&char).unwrap(); - let out_file = out_path.join(entry.file_name().to_str().unwrap().replace("html", "json")); + let out_file = out_path.join(entry.file_name() + .to_str() + .unwrap() + .replace("html", "json")); let mut o = File::create(&out_file).unwrap(); o.write_all(json.as_bytes()).unwrap(); println!("{:?}", out_file); diff --git a/src/pre_process.rs b/src/pre_process.rs index 877ddc3..c68528f 100644 --- a/src/pre_process.rs +++ b/src/pre_process.rs @@ -1,15 +1,13 @@ pub fn strip_irrelevant_content(s: &str) -> String { let mut retn = ""; - match s.find(r#"
"#) { + match s.find(r#""#) { Some(pos) => retn = &s[pos..], None => (), }; - match s.find(r#""#) { + match s.find(r#"

Characters"#) { Some(pos) => retn = &s[..pos], None => (), }; return retn.into(); } - - diff --git a/src/section.rs b/src/section.rs index 7e492b1..48f237d 100644 --- a/src/section.rs +++ b/src/section.rs @@ -1,6 +1,7 @@ use super::regex::Regex; use std::collections::HashMap; +#[derive(Debug)] pub struct Section { pub name: String, pub re: Regex, diff --git a/src/series.rs b/src/series.rs index d111fb8..3037e6a 100644 --- a/src/series.rs +++ b/src/series.rs @@ -21,12 +21,17 @@ pub struct Names { #[derive(Debug, Serialize)] pub struct Series { pub name: Names, + pub tags: Vec, } impl Names { pub fn new() -> Self { Names { - String::new(), String::new(), String::new(), String::new(), String::new() + english: String::new(), + aliases: String::new(), + romaji: String::new(), + furigana: String::new(), + japanese: String::new(), } } } @@ -34,30 +39,40 @@ impl Names { impl Series { pub fn new() -> Self { Series { - name: Names::new() + name: Names::new(), + tags: vec![], } } pub fn parse(&mut self, buf: &str) { - let mut sections = get_sections(); + let mut sections = get_sections(); section::process(&buf, &mut sections); + let re_genre_tags = Regex::new(r#"(?is)Genre Tags.*?>(.*?)"#).unwrap(); + let re_genre_tag = Regex::new(r#"[0-9]">(.*?)"#).unwrap(); + + for cap in re_genre_tags.captures_iter(&buf) { + self.tags = re_genre_tag.captures_iter(cap.at(1).unwrap()) + .map(|v| v.at(1).unwrap().to_string()) + .collect(); + } + { let name: &Section = §ions["name".into()]; - self.name.english = name["english".into()]; - self.name.aliases = name["aliases".into()]; - self.name.romaji = name["romaji".into()]; - self.name.furigana = name["furigana".into()]; - self.name.japanese = name["japanese".into()]; + self.name.english = name.data["english".into()].to_string(); + self.name.aliases = name.data["aliases".into()].to_string(); + self.name.romaji = name.data["romaji".into()].to_string(); + self.name.furigana = name.data["furigana".into()].to_string(); + self.name.japanese = name.data["japanese".into()].to_string(); } - } + } } fn get_sections() -> HashMap { let mut s: HashMap = HashMap::new(); - s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?(.*?).*?Aliases.*?(.*?)?.*?Romaji Title.*?(.*?).*?Furigana Title.*?(.*?).*?Japanese Title.*?(.*?)"#, vec!["english", "aliases", "romaji", "furigana", "japanese"])); + s.insert("name".into(), Section::new("name", r#"(?is)English Title.*?(.*?)<.*?Aliases.*?(.*?)<.*?Romaji Title.*?(.*?)(.*?)(.*?) Vec { let reg_tag = Regex::new(r#"(?is)(.*?)"#).unwrap(); - reg_tag.captures_iter(s).map(|c| Tag { id: u32::from_str(c.at(1).unwrap()).unwrap(), name: c.at(2).unwrap().into() }).collect() + reg_tag.captures_iter(s) + .map(|c| { + Tag { + id: u32::from_str(c.at(1).unwrap()).unwrap(), + name: c.at(2).unwrap().into(), + } + }) + .collect() } -- cgit v0.10.1