diff options
| author | rtz12 <koenig@fagott.pw> | 2016-03-29 15:58:55 (UTC) |
|---|---|---|
| committer | rtz12 <koenig@fagott.pw> | 2016-03-29 15:58:55 (UTC) |
| commit | 1400381e9c193eb7f681d95616eed9070edaab56 (patch) | |
| tree | 2db90f612fcf10c0b4efada9f0e5a2e4f825a1a6 | |
| parent | 8661775d15c703ebfba4d3779d796849939c705b (diff) | |
Crawler hinzugefuegt
| -rw-r--r-- | tools/charcrawler/.gitignore | 2 | ||||
| -rw-r--r-- | tools/charcrawler/crawler/acd.go | 81 | ||||
| -rw-r--r-- | tools/charcrawler/crawler/acd_anime.go | 41 | ||||
| -rw-r--r-- | tools/charcrawler/crawler/anidb.go | 69 | ||||
| -rw-r--r-- | tools/charcrawler/crawler/crawler.go | 67 | ||||
| -rwxr-xr-x | tools/charcrawler/main | bin | 0 -> 9374264 bytes | |||
| -rw-r--r-- | tools/charcrawler/main.go | 16 |
7 files changed, 276 insertions, 0 deletions
diff --git a/tools/charcrawler/.gitignore b/tools/charcrawler/.gitignore new file mode 100644 index 0000000..02a07a1 --- /dev/null +++ b/tools/charcrawler/.gitignore | |||
| @@ -0,0 +1,2 @@ | |||
| 1 | data/ | ||
| 2 | *.txt | ||
diff --git a/tools/charcrawler/crawler/acd.go b/tools/charcrawler/crawler/acd.go new file mode 100644 index 0000000..31efb0d --- /dev/null +++ b/tools/charcrawler/crawler/acd.go | |||
| @@ -0,0 +1,81 @@ | |||
| 1 | package crawler | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "log" | ||
| 5 | "strconv" | ||
| 6 | "strings" | ||
| 7 | |||
| 8 | "github.com/PuerkitoBio/goquery" | ||
| 9 | ) | ||
| 10 | |||
// ACDCrawler scrapes character pages from animecharactersdatabase.com.
type ACDCrawler struct{}

// Name returns the identifier under which this crawler stores its data
// (the data/ subdirectory and the progress save file).
func (a ACDCrawler) Name() string {
	const crawlerName = "ACD"
	return crawlerName
}
| 16 | |||
| 17 | func (a ACDCrawler) Crawl(id int) (CharacterData, error) { | ||
| 18 | c := make(CharacterData) | ||
| 19 | doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/character.php?id=" + strconv.Itoa(id)) | ||
| 20 | if err != nil { | ||
| 21 | log.Println(err) | ||
| 22 | return nil, CrawlError | ||
| 23 | } | ||
| 24 | text := doc.Text() | ||
| 25 | if strings.Contains(text, "bad character : try") { | ||
| 26 | return nil, CharacterNotFound | ||
| 27 | } | ||
| 28 | dataTr := doc.Find("#besttable").Next().ChildrenFiltered("tbody").ChildrenFiltered("tr") | ||
| 29 | leftSide := dataTr.Children().Eq(0) | ||
| 30 | rightSide := dataTr.Children().Eq(1) | ||
| 31 | imageCols := rightSide.ChildrenFiltered("table").First().Find("tbody > tr > td") | ||
| 32 | val, _ := imageCols.Eq(0).Find("img").Attr("src") | ||
| 33 | c["__thumb"] = val | ||
| 34 | val, _ = imageCols.Eq(1).Find("a").Attr("href") | ||
| 35 | c["__img"] = val | ||
| 36 | leftSide.ChildrenFiltered("table").Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) { | ||
| 37 | c[s.Find("th").Text()] = s.Find("td").Text() | ||
| 38 | }) | ||
| 39 | var key string | ||
| 40 | leftSide. | ||
| 41 | ChildrenFiltered("table"). | ||
| 42 | Eq(0). | ||
| 43 | Find("td"). | ||
| 44 | Eq(1). | ||
| 45 | Find("dl"). | ||
| 46 | Children(). | ||
| 47 | Each(func(i int, s *goquery.Selection) { | ||
| 48 | switch goquery.NodeName(s) { | ||
| 49 | case "dt": | ||
| 50 | key = s.Text() | ||
| 51 | case "dd": | ||
| 52 | c[key] = s.Text() | ||
| 53 | } | ||
| 54 | }) | ||
| 55 | tags := make([]string, 0) | ||
| 56 | leftSide.ChildrenFiltered("div").Eq(0).Find("a").Each(func(i int, s *goquery.Selection) { | ||
| 57 | tags = append(tags, s.Text()) | ||
| 58 | }) | ||
| 59 | c["__tags"] = tags | ||
| 60 | vas := make([]string, 0) | ||
| 61 | leftSide.ChildrenFiltered("div").Eq(1).Find("a").Each(func(i int, s *goquery.Selection) { | ||
| 62 | vas = append(vas, s.Text()) | ||
| 63 | }) | ||
| 64 | c["__vas"] = vas | ||
| 65 | leftSide.ChildrenFiltered("dl").Children().Each(func(i int, s *goquery.Selection) { | ||
| 66 | switch goquery.NodeName(s) { | ||
| 67 | case "dt": | ||
| 68 | key = s.Text() | ||
| 69 | case "dd": | ||
| 70 | c[key] = s.Text() | ||
| 71 | } | ||
| 72 | }) | ||
| 73 | apps := make([]int, 0) | ||
| 74 | rightSide.Find(".tile3top").Each(func(i int, s *goquery.Selection) { | ||
| 75 | val, _ = s.Find("a").Attr("href") | ||
| 76 | id, _ := strconv.Atoi(strings.Split(val, "=")[1]) | ||
| 77 | apps = append(apps, id) | ||
| 78 | }) | ||
| 79 | c["__appearances"] = apps | ||
| 80 | return c, nil | ||
| 81 | } | ||
diff --git a/tools/charcrawler/crawler/acd_anime.go b/tools/charcrawler/crawler/acd_anime.go new file mode 100644 index 0000000..106655d --- /dev/null +++ b/tools/charcrawler/crawler/acd_anime.go | |||
| @@ -0,0 +1,41 @@ | |||
| 1 | package crawler | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "log" | ||
| 5 | "strconv" | ||
| 6 | "strings" | ||
| 7 | |||
| 8 | "github.com/PuerkitoBio/goquery" | ||
| 9 | ) | ||
| 10 | |||
// ACDAnimeCrawler scrapes series pages from animecharactersdatabase.com.
type ACDAnimeCrawler struct{}

// Name returns the identifier under which this crawler stores its data
// (the data/ subdirectory and the progress save file).
func (a ACDAnimeCrawler) Name() string {
	const crawlerName = "ACDAnime"
	return crawlerName
}
| 16 | |||
| 17 | func (a ACDAnimeCrawler) Crawl(id int) (CharacterData, error) { | ||
| 18 | c := make(CharacterData) | ||
| 19 | doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/series.php?id=" + strconv.Itoa(id)) | ||
| 20 | if err != nil { | ||
| 21 | log.Println(err) | ||
| 22 | return nil, CrawlError | ||
| 23 | } | ||
| 24 | text := doc.Text() | ||
| 25 | if strings.Contains(text, "bad series : try") { | ||
| 26 | return nil, CharacterNotFound | ||
| 27 | } | ||
| 28 | tds := doc.Find("#besttable > table > tbody > tr > td") | ||
| 29 | val, _ := tds.Eq(0).Find("img").Attr("src") | ||
| 30 | c["__thumb"] = val | ||
| 31 | tds.Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) { | ||
| 32 | key := s.Find("th").Text() | ||
| 33 | value := s.Find("td").Text() | ||
| 34 | if key == "Home Page" { | ||
| 35 | val, _ = s.Find("td > a").Attr("href") | ||
| 36 | value = val | ||
| 37 | } | ||
| 38 | c[key] = value | ||
| 39 | }) | ||
| 40 | return c, nil | ||
| 41 | } | ||
diff --git a/tools/charcrawler/crawler/anidb.go b/tools/charcrawler/crawler/anidb.go new file mode 100644 index 0000000..12bf05b --- /dev/null +++ b/tools/charcrawler/crawler/anidb.go | |||
| @@ -0,0 +1,69 @@ | |||
| 1 | package crawler | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "log" | ||
| 5 | "strconv" | ||
| 6 | "strings" | ||
| 7 | |||
| 8 | "github.com/PuerkitoBio/goquery" | ||
| 9 | ) | ||
| 10 | |||
// AniDBCrawler scrapes character pages from anidb.net.
type AniDBCrawler struct{}

// Name returns the identifier under which this crawler stores its data
// (the data/ subdirectory and the progress save file).
func (a AniDBCrawler) Name() string {
	const crawlerName = "AniDB"
	return crawlerName
}
| 16 | |||
| 17 | func (a AniDBCrawler) Crawl(id int) (CharacterData, error) { | ||
| 18 | c := make(CharacterData) | ||
| 19 | doc, err := goquery.NewDocument("http://anidb.net/perl-bin/animedb.pl?show=character&charid=" + strconv.Itoa(id)) | ||
| 20 | if err != nil { | ||
| 21 | log.Println(err) | ||
| 22 | return nil, CrawlError | ||
| 23 | } | ||
| 24 | siteErr, _ := doc.Find(".error p").Html() | ||
| 25 | if strings.Contains(siteErr, "Unknown character id") { | ||
| 26 | return nil, CharacterNotFound | ||
| 27 | } else if strings.Contains(siteErr, "BANNED") { | ||
| 28 | return nil, CrawlError | ||
| 29 | } | ||
| 30 | first := true | ||
| 31 | links := make([]string, 0) | ||
| 32 | doc.Find(".characters").Each(func(i int, s *goquery.Selection) { | ||
| 33 | if first { | ||
| 34 | s.Find(".g_definitionlist tr").Each(func(i int, s *goquery.Selection) { | ||
| 35 | keyHtml, _ := s.Find("th").Html() | ||
| 36 | valueHtml, _ := s.Find("td").Html() | ||
| 37 | key := strings.TrimSpace(keyHtml) | ||
| 38 | value := strings.TrimSpace(valueHtml) | ||
| 39 | c[key] = value | ||
| 40 | }) | ||
| 41 | desc, _ := s.Find(".desc").Html() | ||
| 42 | c["__desc"] = strings.TrimSpace(desc) | ||
| 43 | imgUrl, _ := s.Find(".image img").Attr("src") | ||
| 44 | c["__img"] = imgUrl | ||
| 45 | animes := make([]map[string]string, 0) | ||
| 46 | s.Find(".animelist tr").Each(func(i int, s *goquery.Selection) { | ||
| 47 | if s.HasClass("header") { | ||
| 48 | return | ||
| 49 | } | ||
| 50 | anime := make(map[string]string) | ||
| 51 | relHtml, _ := s.Find(".relation").Html() | ||
| 52 | nameHtml, _ := s.Find(".name a").Html() | ||
| 53 | rel := strings.TrimSpace(relHtml) | ||
| 54 | name := strings.TrimSpace(nameHtml) | ||
| 55 | anime["rel"] = rel | ||
| 56 | anime["name"] = name | ||
| 57 | animes = append(animes, anime) | ||
| 58 | }) | ||
| 59 | c["__animes"] = animes | ||
| 60 | } else { | ||
| 61 | link, _ := s.Find(".mainname a").Attr("href") | ||
| 62 | links = append(links, strings.Replace(link, "http://anidb.net/ch", "", 1)) | ||
| 63 | } | ||
| 64 | first = false | ||
| 65 | }) | ||
| 66 | c["__links"] = links | ||
| 67 | |||
| 68 | return c, nil | ||
| 69 | } | ||
diff --git a/tools/charcrawler/crawler/crawler.go b/tools/charcrawler/crawler/crawler.go new file mode 100644 index 0000000..337bc76 --- /dev/null +++ b/tools/charcrawler/crawler/crawler.go | |||
| @@ -0,0 +1,67 @@ | |||
| 1 | package crawler | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "encoding/json" | ||
| 5 | "errors" | ||
| 6 | "io/ioutil" | ||
| 7 | "log" | ||
| 8 | "os" | ||
| 9 | "strconv" | ||
| 10 | "strings" | ||
| 11 | "time" | ||
| 12 | ) | ||
| 13 | |||
// CharacterData is the generic scrape result: a free-form map from the
// source site's own field labels (plus crawler-defined "__"-prefixed keys
// such as "__img" or "__tags") to the scraped values.
type CharacterData map[string]interface{}

// Crawler is implemented by each site-specific scraper. Name identifies the
// crawler (used for its data/ subdirectory and progress save file); Crawl
// fetches a single record by numeric site id.
type Crawler interface {
	Name() string
	Crawl(id int) (CharacterData, error)
}

// Instances holds all registered crawlers; init() in this package appends
// to it, and main starts one Start goroutine per entry.
var Instances []Crawler

// Sentinel errors returned by Crawl implementations.
// NOTE(review): idiomatic Go would name these ErrCrawl / ErrCharacterNotFound
// with lowercase messages; left unchanged because callers compare against
// these exact identifiers.
var (
	CrawlError        = errors.New("Error while crawling")
	CharacterNotFound = errors.New("Character not found")
)
| 27 | |||
| 28 | func Start(c Crawler) { | ||
| 29 | name := c.Name() | ||
| 30 | os.MkdirAll("data/"+name, 0755) | ||
| 31 | log.Printf("Starting Crawler %s...", name) | ||
| 32 | ticker := time.NewTicker(time.Second * 1) | ||
| 33 | current := 1 | ||
| 34 | if save, err := ioutil.ReadFile(name + ".txt"); err == nil { | ||
| 35 | s := strings.TrimSpace(string(save)) | ||
| 36 | if i, err := strconv.Atoi(s); err == nil { | ||
| 37 | current = i | ||
| 38 | } | ||
| 39 | } | ||
| 40 | faultCounter := 0 | ||
| 41 | for range ticker.C { | ||
| 42 | if faultCounter > 100 { | ||
| 43 | faultCounter = 0 | ||
| 44 | log.Printf("[%s] Exiting after 100 fails", name) | ||
| 45 | break | ||
| 46 | } | ||
| 47 | log.Printf("[%s] Crawling %d", name, current) | ||
| 48 | char, err := c.Crawl(current) | ||
| 49 | switch err { | ||
| 50 | case CharacterNotFound: | ||
| 51 | log.Printf("[%s] Char %d not found!", name, current) | ||
| 52 | faultCounter++ | ||
| 53 | case CrawlError: | ||
| 54 | panic(err) | ||
| 55 | default: | ||
| 56 | cData, _ := json.Marshal(char) | ||
| 57 | ioutil.WriteFile("data/"+name+"/"+strconv.Itoa(current)+".json", cData, 0755) | ||
| 58 | } | ||
| 59 | |||
| 60 | current++ | ||
| 61 | ioutil.WriteFile(name+".txt", []byte(strconv.Itoa(current)), os.ModePerm) | ||
| 62 | } | ||
| 63 | } | ||
| 64 | |||
| 65 | func init() { | ||
| 66 | Instances = append(Instances, new(ACDCrawler), new(ACDAnimeCrawler)) | ||
| 67 | } | ||
diff --git a/tools/charcrawler/main b/tools/charcrawler/main new file mode 100755 index 0000000..71f9896 --- /dev/null +++ b/tools/charcrawler/main | |||
| Binary files differ | |||
diff --git a/tools/charcrawler/main.go b/tools/charcrawler/main.go new file mode 100644 index 0000000..0c94151 --- /dev/null +++ b/tools/charcrawler/main.go | |||
| @@ -0,0 +1,16 @@ | |||
| 1 | package main | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "time" | ||
| 5 | |||
| 6 | "private/charcrawler/crawler" | ||
| 7 | ) | ||
| 8 | |||
| 9 | func main() { | ||
| 10 | for _, c := range crawler.Instances { | ||
| 11 | go crawler.Start(c) | ||
| 12 | } | ||
| 13 | for { | ||
| 14 | time.Sleep(time.Minute) | ||
| 15 | } | ||
| 16 | } | ||
