diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Cargo.toml | 10 | ||||
-rw-r--r-- | src/main.rs | 75 |
3 files changed, 86 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eb5a316 --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1 @@ | |||
target | |||
diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..534b5ca --- /dev/null +++ b/Cargo.toml | |||
@@ -0,0 +1,10 @@ | |||
1 | [package] | ||
2 | name = "acd_crawl" | ||
3 | version = "0.1.0" | ||
4 | authors = ["jan <jan@ruken.pw>"] | ||
5 | |||
6 | [dependencies] | ||
7 | hyper = "0.9" | ||
8 | rand = "0.3" | ||
9 | log = "0.3" | ||
10 | env_logger = "0.3" \ No newline at end of file | ||
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..69a9c74 --- /dev/null +++ b/src/main.rs | |||
@@ -0,0 +1,75 @@ | |||
1 | extern crate hyper; | ||
2 | use hyper::client::Client; | ||
3 | use hyper::header::UserAgent; | ||
4 | use hyper::status::StatusCode; | ||
5 | |||
6 | #[macro_use] extern crate log; | ||
7 | extern crate env_logger; | ||
8 | |||
9 | extern crate rand; | ||
10 | use rand::{ thread_rng, Rng}; | ||
11 | |||
12 | use std::fs; | ||
13 | use std::fs::File; | ||
14 | use std::sync::Arc; | ||
15 | use std::ops::Deref; | ||
16 | use std::io::prelude::*; | ||
17 | use std::env; | ||
18 | use std::thread; | ||
19 | use std::time::Duration; | ||
20 | use std::str::FromStr; | ||
21 | |||
22 | use std::path::Path; | ||
23 | |||
/// Exclusive upper bound used when drawing random character ids.
const MAX_CHARS: u32 = 77784;
/// URL prefix of a character page; the numeric character id is appended to it.
const BASE_URL: &'static str = "http://www.animecharactersdatabase.com/character.php?id=";
26 | |||
27 | fn get_next(base_path: &str) -> u32 { | ||
28 | let base = Path::new(base_path); | ||
29 | |||
30 | let mut num: u32 = thread_rng().gen_range(1, MAX_CHARS); | ||
31 | while base.join(format!("{}.html", num)).exists() { | ||
32 | num = thread_rng().gen_range(1, MAX_CHARS); | ||
33 | } | ||
34 | return num; | ||
35 | } | ||
36 | |||
37 | fn download(client: &Client, base_path: &str, char: u32) { | ||
38 | debug!("downloading character {}", char); | ||
39 | let mut res = client.get(&format!("{}{}", BASE_URL, char)) | ||
40 | .header(UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14915".into())) | ||
41 | .send() | ||
42 | .expect("could not send request"); | ||
43 | |||
44 | if res.status != StatusCode::Ok { | ||
45 | panic!("invalid status code {}", res.status); | ||
46 | } | ||
47 | debug!("got response"); | ||
48 | |||
49 | let mut buf = String::new(); | ||
50 | res.read_to_string(&mut buf).expect("could not read response"); | ||
51 | |||
52 | // v I don't know whether that works anymore | ||
53 | if buf.find("IP ban").is_some() { | ||
54 | panic!("WE ARE BANNED"); | ||
55 | } | ||
56 | |||
57 | let mut f = File::create(Path::new(base_path).join(format!("{}.html", char))).expect("could not create file"); | ||
58 | |||
59 | f.write_all(buf.as_bytes()).expect("could not write response to file"); | ||
60 | } | ||
61 | |||
62 | fn main() { | ||
63 | let timeout = u64::from_str(&(env::var("TIMEOUT").expect("TIMEOUT env var not set"))).expect("invalid TIMEOUT env var"); | ||
64 | let base_path = env::var("OUT").expect("OUT not set"); | ||
65 | env_logger::init().unwrap(); | ||
66 | fs::create_dir_all("html").ok(); | ||
67 | |||
68 | let client = Arc::new(Client::new()); | ||
69 | |||
70 | loop { | ||
71 | info!("getting next character"); | ||
72 | download(client.clone().deref(), &base_path, get_next(&base_path)); | ||
73 | thread::sleep(Duration::from_secs(timeout)); | ||
74 | } | ||
75 | } | ||