diff options
| author | jan <jan@ruken.pw> | 2016-09-18 14:44:22 (UTC) |
|---|---|---|
| committer | jan <jan@ruken.pw> | 2016-09-18 14:44:22 (UTC) |
| commit | 4ed4b0951982475bfacead6de53c505a2bc45f08 (patch) | |
| tree | 0c63b22bb0732e8c6219a51e8738c71c8fa23a59 | |
| -rw-r--r-- | .gitignore | 3 | ||||
| -rw-r--r-- | Cargo.lock | 116 | ||||
| -rw-r--r-- | Cargo.toml | 9 | ||||
| -rw-r--r-- | src/main.rs | 94 |
4 files changed, 222 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..42df15e --- /dev/null +++ b/.gitignore | |||
| @@ -0,0 +1,3 @@ | |||
| 1 | target | ||
| 2 | characters | ||
| 3 | series.txt \ No newline at end of file | ||
diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..4868267 --- /dev/null +++ b/Cargo.lock | |||
| @@ -0,0 +1,116 @@ | |||
| 1 | [root] | ||
| 2 | name = "acd_anime_export" | ||
| 3 | version = "0.1.0" | ||
| 4 | dependencies = [ | ||
| 5 | "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 6 | "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 7 | "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 8 | ] | ||
| 9 | |||
| 10 | [[package]] | ||
| 11 | name = "aho-corasick" | ||
| 12 | version = "0.5.3" | ||
| 13 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 14 | dependencies = [ | ||
| 15 | "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 16 | ] | ||
| 17 | |||
| 18 | [[package]] | ||
| 19 | name = "env_logger" | ||
| 20 | version = "0.3.5" | ||
| 21 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 22 | dependencies = [ | ||
| 23 | "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 24 | "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 25 | ] | ||
| 26 | |||
| 27 | [[package]] | ||
| 28 | name = "kernel32-sys" | ||
| 29 | version = "0.2.2" | ||
| 30 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 31 | dependencies = [ | ||
| 32 | "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 33 | "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 34 | ] | ||
| 35 | |||
| 36 | [[package]] | ||
| 37 | name = "libc" | ||
| 38 | version = "0.2.16" | ||
| 39 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 40 | |||
| 41 | [[package]] | ||
| 42 | name = "log" | ||
| 43 | version = "0.3.6" | ||
| 44 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 45 | |||
| 46 | [[package]] | ||
| 47 | name = "memchr" | ||
| 48 | version = "0.1.11" | ||
| 49 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 50 | dependencies = [ | ||
| 51 | "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 52 | ] | ||
| 53 | |||
| 54 | [[package]] | ||
| 55 | name = "regex" | ||
| 56 | version = "0.1.77" | ||
| 57 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 58 | dependencies = [ | ||
| 59 | "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 60 | "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 61 | "regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 62 | "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 63 | "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 64 | ] | ||
| 65 | |||
| 66 | [[package]] | ||
| 67 | name = "regex-syntax" | ||
| 68 | version = "0.3.5" | ||
| 69 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 70 | |||
| 71 | [[package]] | ||
| 72 | name = "thread-id" | ||
| 73 | version = "2.0.0" | ||
| 74 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 75 | dependencies = [ | ||
| 76 | "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 77 | "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 78 | ] | ||
| 79 | |||
| 80 | [[package]] | ||
| 81 | name = "thread_local" | ||
| 82 | version = "0.2.7" | ||
| 83 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 84 | dependencies = [ | ||
| 85 | "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", | ||
| 86 | ] | ||
| 87 | |||
| 88 | [[package]] | ||
| 89 | name = "utf8-ranges" | ||
| 90 | version = "0.1.3" | ||
| 91 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 92 | |||
| 93 | [[package]] | ||
| 94 | name = "winapi" | ||
| 95 | version = "0.2.8" | ||
| 96 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 97 | |||
| 98 | [[package]] | ||
| 99 | name = "winapi-build" | ||
| 100 | version = "0.1.1" | ||
| 101 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 102 | |||
| 103 | [metadata] | ||
| 104 | "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" | ||
| 105 | "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" | ||
| 106 | "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" | ||
| 107 | "checksum libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)" = "408014cace30ee0f767b1c4517980646a573ec61a57957aeeabcac8ac0a02e8d" | ||
| 108 | "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" | ||
| 109 | "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" | ||
| 110 | "checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" | ||
| 111 | "checksum regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279401017ae31cf4e15344aa3f085d0e2e5c1e70067289ef906906fdbe92c8fd" | ||
| 112 | "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" | ||
| 113 | "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" | ||
| 114 | "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" | ||
| 115 | "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" | ||
| 116 | "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" | ||
diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..bd253c6 --- /dev/null +++ b/Cargo.toml | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | [package] | ||
| 2 | name = "acd_series_finder" | ||
| 3 | version = "0.1.0" | ||
| 4 | authors = ["rknshia"] | ||
| 5 | |||
| 6 | [dependencies] | ||
| 7 | log = "0.3" | ||
| 8 | env_logger = "0.3" | ||
| 9 | regex = "0.1" \ No newline at end of file | ||
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..7cf9df9 --- /dev/null +++ b/src/main.rs | |||
| @@ -0,0 +1,94 @@ | |||
| 1 | #[macro_use] extern crate log; | ||
| 2 | extern crate env_logger; | ||
| 3 | |||
| 4 | extern crate regex; | ||
| 5 | use regex::Regex; | ||
| 6 | |||
| 7 | use std::path::{Path, PathBuf}; | ||
| 8 | use std::fs::File; | ||
| 9 | use std::io::prelude::*; | ||
| 10 | use std::collections::BTreeMap; | ||
| 11 | use std::env; | ||
| 12 | use std::str::FromStr; | ||
| 13 | use std::sync::{Arc, Mutex}; | ||
| 14 | use std::sync::atomic::{AtomicUsize, Ordering}; | ||
| 15 | use std::thread; | ||
| 16 | |||
| 17 | static MAX_THREADS: u32 = 8; | ||
| 18 | |||
| 19 | fn main() { | ||
| 20 | env_logger::init().unwrap(); | ||
| 21 | |||
| 22 | let bp = env::var("HTML_FILES").expect("HTML_FILES not set"); | ||
| 23 | let p = Path::new(&bp); | ||
| 24 | if !p.exists() { | ||
| 25 | panic!("no HTML_FILES directory"); | ||
| 26 | } | ||
| 27 | |||
| 28 | let re_assignment_section = Regex::new(r#"(?mi)appears in the following</P>\s*(.*?)\s*</UL>\s*?</DIV>"#).unwrap(); | ||
| 29 | let re_anime_link = Regex::new(r#"<div class=tile3top><A href="series\.php\?id=([0-9]+)">(.*?)</A>\s*?</div>"#).unwrap(); | ||
| 30 | let ws = Regex::new(r"\s").unwrap(); | ||
| 31 | let mut animes: Arc<Mutex<BTreeMap<u32, String>>> = Arc::new(Mutex::new(BTreeMap::new())); | ||
| 32 | let mut files: Arc<Mutex<Vec<PathBuf>>> = Arc::new(Mutex::new(vec![])); | ||
| 33 | let mut threads_active = Arc::new(AtomicUsize::new(0)); | ||
| 34 | |||
| 35 | let dir = p.read_dir().expect("could not read html directory"); | ||
| 36 | { | ||
| 37 | let mut files = files.lock().unwrap(); | ||
| 38 | for f in dir { | ||
| 39 | files.push(f.unwrap().path().to_path_buf()); | ||
| 40 | } | ||
| 41 | } | ||
| 42 | |||
| 43 | for i in 1..MAX_THREADS { | ||
| 44 | let mut animes = animes.clone(); | ||
| 45 | let mut files = files.clone(); | ||
| 46 | let re_assignment_section = re_assignment_section.clone(); | ||
| 47 | let re_anime_link = re_anime_link.clone(); | ||
| 48 | let ws = ws.clone(); | ||
| 49 | let threads_active = threads_active.clone(); | ||
| 50 | threads_active.fetch_add(1, Ordering::SeqCst); | ||
| 51 | |||
| 52 | thread::spawn(move || { | ||
| 53 | loop { | ||
| 54 | let mut f: PathBuf; | ||
| 55 | { | ||
| 56 | f = match files.lock().unwrap().pop() { | ||
| 57 | None => { | ||
| 58 | debug!("thread finished"); | ||
| 59 | threads_active.fetch_sub(1, Ordering::SeqCst); | ||
| 60 | return; | ||
| 61 | }, | ||
| 62 | Some(s) => s | ||
| 63 | }; | ||
| 64 | } | ||
| 65 | |||
| 66 | let mut fh = File::open(&f).expect("could not open file"); | ||
| 67 | let mut buf = String::new(); | ||
| 68 | if let Err(_) = fh.read_to_string(&mut buf) { | ||
| 69 | error!("INVALID {:?}", f); | ||
| 70 | continue; | ||
| 71 | }; | ||
| 72 | buf = ws.replace_all(&buf, " "); | ||
| 73 | |||
| 74 | let section = re_assignment_section.captures(&buf).expect("no assignments for character?").at(1).unwrap(); | ||
| 75 | |||
| 76 | let mut animes = animes.lock().unwrap(); | ||
| 77 | for anime in re_anime_link.captures_iter(&section) { | ||
| 78 | debug!("{}: {}", anime.at(1).unwrap(), anime.at(2).unwrap()); | ||
| 79 | animes.insert(u32::from_str(anime.at(1).unwrap()).unwrap(), anime.at(2).unwrap().into()); | ||
| 80 | } | ||
| 81 | } | ||
| 82 | }); | ||
| 83 | } | ||
| 84 | |||
| 85 | while threads_active.load(Ordering::SeqCst) != 0 {} | ||
| 86 | |||
| 87 | let mut animes = animes.lock().unwrap(); | ||
| 88 | println!("found {} series. writing series.txt", animes.len()); | ||
| 89 | |||
| 90 | let mut f = File::create("series.txt").expect("could not create series.txt"); | ||
| 91 | for (k, v) in animes.iter() { | ||
| 92 | f.write_all(format!("{:?};;{}\n", k, v).as_bytes()).expect("write failed"); | ||
| 93 | } | ||
| 94 | } | ||
