aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/main.rs10
-rw-r--r--src/pre_process.rs27
2 files changed, 26 insertions, 11 deletions
diff --git a/src/main.rs b/src/main.rs
index b733e6f..2123c6b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -13,15 +13,11 @@ mod pre_process;
13use pre_process::Section; 13use pre_process::Section;
14 14
15fn main() { 15fn main() {
16 let raw_files = env::var("RAW_FILES").unwrap_or("S:\\grilist\\acd\\acd_character_parser\\characters\\".into()); 16 let raw_files = env::var("RAW_FILES").unwrap_or("characters".into());
17 let base_path = Path::new(&raw_files); 17 let base_path = Path::new(&raw_files);
18 18
19 let mut sections: Vec<Section> = vec![]; 19 let mut sections: Vec<Section> = vec![];
20 sections.push(Section { 20 sections.push(Section::new("image", r#"(?is)<H3 id="section99">.*<img src="(.*?)" alt=.*?></a><p><a href="(.*?)">View Full Size Image"#, vec!["thumb".into(), "full".into()]));
21 name: "image".into(),
22 re: Regex::new(r#"(?is)<H3 id="section99">.*<p><a href="(.*?)">View Full Size Image"#).unwrap(),
23 content: String::new(),
24 });
25 21
26 for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) { 22 for entry in WalkDir::new(base_path).min_depth(1).into_iter().filter_map(|e| e.ok()) {
27 let mut f = File::open(entry.path()).expect("could not open file"); 23 let mut f = File::open(entry.path()).expect("could not open file");
@@ -35,6 +31,6 @@ fn main() {
35 31
36 pre_process::split_sections(&buf, &mut sections); 32 pre_process::split_sections(&buf, &mut sections);
37 33
38 println!("{}", sections[0].content); 34 println!("{:?}", sections[0].data);
39 } 35 }
40} 36}
diff --git a/src/pre_process.rs b/src/pre_process.rs
index d69cfce..0d8c6be 100644
--- a/src/pre_process.rs
+++ b/src/pre_process.rs
@@ -1,5 +1,7 @@
1use super::regex::Regex; 1use super::regex::Regex;
2 2
3use std::collections::HashMap;
4
3pub fn strip_irrelevant_content(s: &str) -> String { 5pub fn strip_irrelevant_content(s: &str) -> String {
4 let mut retn = ""; 6 let mut retn = "";
5 match s.find(r#"<div class=profile id=profile>"#) { 7 match s.find(r#"<div class=profile id=profile>"#) {
@@ -17,14 +19,31 @@ pub fn strip_irrelevant_content(s: &str) -> String {
17pub struct Section { 19pub struct Section {
18 pub name: String, 20 pub name: String,
19 pub re: Regex, 21 pub re: Regex,
20 pub content: String, 22 pub keys: Vec<String>,
23 pub data: HashMap<String, String>,
24}
25
26impl Section {
27 pub fn new(name: &str, re: &str, groups: Vec<String>) -> Self {
28 Section {
29 name: name.into(),
30 re: Regex::new(re).unwrap(),
31 keys: groups,
32 data: HashMap::new(),
33 }
34 }
21} 35}
22 36
23pub fn split_sections(d: &str, s: &mut Vec<Section>) { 37pub fn split_sections(d: &str, s: &mut Vec<Section>) {
24 for section in s { 38 for section in s {
25 for m in section.re.captures_iter(d) { 39 for m in section.re.captures_iter(d) {
26 assert!(m.len() > 1); 40 assert!(m.len() >= section.keys.len() + 1);
27 section.content = format!("{}", m.at(1).unwrap()); 41
42 let mut idx = 0;
43 for key in &section.keys {
44 section.data.insert(key.clone(), m.at(idx + 1).unwrap().into());
45 idx += 1;
46 }
28 } 47 }
29 } 48 }
30} \ No newline at end of file 49}