aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author jan <jan@ruken.pw> 2016-09-18 14:50:23 (UTC)
committer jan <jan@ruken.pw> 2016-09-18 14:50:23 (UTC)
commit efa8e3f091d196228215be82adfe1db20a5595ab (patch)
tree 5fdc6b0f12d280f463ab5d5eed48404943b99a36
initial commit (HEAD, master)
-rw-r--r-- .gitignore 1
-rw-r--r-- Cargo.toml 10
-rw-r--r-- src/main.rs 75
3 files changed, 86 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..eb5a316
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
target
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..534b5ca
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
1[package]
2name = "acd_crawl"
3version = "0.1.0"
4authors = ["jan <jan@ruken.pw>"]
5
6[dependencies]
7hyper = "0.9"
8rand = "0.3"
9log = "0.3"
10env_logger = "0.3" \ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..69a9c74
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,75 @@
1extern crate hyper;
2use hyper::client::Client;
3use hyper::header::UserAgent;
4use hyper::status::StatusCode;
5
6#[macro_use] extern crate log;
7extern crate env_logger;
8
9extern crate rand;
10use rand::{ thread_rng, Rng};
11
12use std::fs;
13use std::fs::File;
14use std::sync::Arc;
15use std::ops::Deref;
16use std::io::prelude::*;
17use std::env;
18use std::thread;
19use std::time::Duration;
20use std::str::FromStr;
21
22use std::path::Path;
23
/// Upper bound handed to `gen_range` when picking a random character id.
// NOTE(review): presumably the highest character id on the site when this was written — confirm.
static MAX_CHARS: u32 = 77_784;

/// URL prefix of a character page; the numeric character id is appended to it.
static BASE_URL: &'static str = "http://www.animecharactersdatabase.com/character.php?id=";
26
27fn get_next(base_path: &str) -> u32 {
28 let base = Path::new(base_path);
29
30 let mut num: u32 = thread_rng().gen_range(1, MAX_CHARS);
31 while base.join(format!("{}.html", num)).exists() {
32 num = thread_rng().gen_range(1, MAX_CHARS);
33 }
34 return num;
35}
36
37fn download(client: &Client, base_path: &str, char: u32) {
38 debug!("downloading character {}", char);
39 let mut res = client.get(&format!("{}{}", BASE_URL, char))
40 .header(UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14915".into()))
41 .send()
42 .expect("could not send request");
43
44 if res.status != StatusCode::Ok {
45 panic!("invalid status code {}", res.status);
46 }
47 debug!("got response");
48
49 let mut buf = String::new();
50 res.read_to_string(&mut buf).expect("could not read response");
51
52 // v I don't know whether that works anymore
53 if buf.find("IP ban").is_some() {
54 panic!("WE ARE BANNED");
55 }
56
57 let mut f = File::create(Path::new(base_path).join(format!("{}.html", char))).expect("could not create file");
58
59 f.write_all(buf.as_bytes()).expect("could not write response to file");
60}
61
62fn main() {
63 let timeout = u64::from_str(&(env::var("TIMEOUT").expect("TIMEOUT env var not set"))).expect("invalid TIMEOUT env var");
64 let base_path = env::var("OUT").expect("OUT not set");
65 env_logger::init().unwrap();
66 fs::create_dir_all("html").ok();
67
68 let client = Arc::new(Client::new());
69
70 loop {
71 info!("getting next character");
72 download(client.clone().deref(), &base_path, get_next(&base_path));
73 thread::sleep(Duration::from_secs(timeout));
74 }
75}