author     Jan C <jan@ruken.pw>   2016-03-29 16:08:41 (UTC)
committer  Jan C <jan@ruken.pw>   2016-03-29 16:08:41 (UTC)
commit     b601ea312d2c08560fcb70312536e6ba61d09bf4 (patch)
tree       6490675fc6ee77ceacf1b760eea1d5380cbb0f7c
parent     e0e59e9ca817f1e3c2091cf1b8ac370419fec8aa (diff)
parent     ae4f7f89849c4e94b388aabdb28d8f9f110df3c2 (diff)
Merge branch 'master' of ssh://projekte.fagott.pw/grilist
-rw-r--r--  tools/charcrawler/.gitignore             2
-rw-r--r--  tools/charcrawler/crawler/acd.go        84
-rw-r--r--  tools/charcrawler/crawler/acd_anime.go  44
-rw-r--r--  tools/charcrawler/crawler/anidb.go      69
-rw-r--r--  tools/charcrawler/crawler/crawler.go    70
-rwxr-xr-x  tools/charcrawler/main                  bin 0 -> 9374264 bytes
-rw-r--r--  tools/charcrawler/main.go               16
7 files changed, 285 insertions, 0 deletions
diff --git a/tools/charcrawler/.gitignore b/tools/charcrawler/.gitignore
new file mode 100644
index 0000000..02a07a1
--- /dev/null
+++ b/tools/charcrawler/.gitignore
@@ -0,0 +1,2 @@
+data/
+*.txt
diff --git a/tools/charcrawler/crawler/acd.go b/tools/charcrawler/crawler/acd.go
new file mode 100644
index 0000000..9a37b95
--- /dev/null
+++ b/tools/charcrawler/crawler/acd.go
@@ -0,0 +1,84 @@
+package crawler
+
+import (
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+type ACDCrawler struct{}
+
+func (a ACDCrawler) Name() string {
+	return "ACD"
+}
+
+func (a ACDCrawler) Crawl(id int) (CharacterData, error) {
+	c := make(CharacterData)
+	doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/character.php?id=" + strconv.Itoa(id))
+	if err != nil {
+		log.Println(err)
+		return nil, CrawlError
+	}
+	text := doc.Text()
+	if strings.Contains(text, "bad character : try") {
+		return nil, CharacterNotFound
+	}
+	if strings.Contains(text, "Your IP has been blocked") {
+		return nil, Banned
+	}
+	dataTr := doc.Find("#besttable").Next().ChildrenFiltered("tbody").ChildrenFiltered("tr")
+	leftSide := dataTr.Children().Eq(0)
+	rightSide := dataTr.Children().Eq(1)
+	imageCols := rightSide.ChildrenFiltered("table").First().Find("tbody > tr > td")
+	val, _ := imageCols.Eq(0).Find("img").Attr("src")
+	c["__thumb"] = val
+	val, _ = imageCols.Eq(1).Find("a").Attr("href")
+	c["__img"] = val
+	leftSide.ChildrenFiltered("table").Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) {
+		c[s.Find("th").Text()] = s.Find("td").Text()
+	})
+	var key string
+	leftSide.
+		ChildrenFiltered("table").
+		Eq(0).
+		Find("td").
+		Eq(1).
+		Find("dl").
+		Children().
+		Each(func(i int, s *goquery.Selection) {
+			switch goquery.NodeName(s) {
+			case "dt":
+				key = s.Text()
+			case "dd":
+				c[key] = s.Text()
+			}
+		})
+	tags := make([]string, 0)
+	leftSide.ChildrenFiltered("div").Eq(0).Find("a").Each(func(i int, s *goquery.Selection) {
+		tags = append(tags, s.Text())
+	})
+	c["__tags"] = tags
+	vas := make([]string, 0)
+	leftSide.ChildrenFiltered("div").Eq(1).Find("a").Each(func(i int, s *goquery.Selection) {
+		vas = append(vas, s.Text())
+	})
+	c["__vas"] = vas
+	leftSide.ChildrenFiltered("dl").Children().Each(func(i int, s *goquery.Selection) {
+		switch goquery.NodeName(s) {
+		case "dt":
+			key = s.Text()
+		case "dd":
+			c[key] = s.Text()
+		}
+	})
+	apps := make([]int, 0)
+	rightSide.Find(".tile3top").Each(func(i int, s *goquery.Selection) {
+		val, _ = s.Find("a").Attr("href")
+		id, _ := strconv.Atoi(strings.Split(val, "=")[1])
+		apps = append(apps, id)
+	})
+	c["__appearances"] = apps
+	return c, nil
+}
diff --git a/tools/charcrawler/crawler/acd_anime.go b/tools/charcrawler/crawler/acd_anime.go
new file mode 100644
index 0000000..c697b43
--- /dev/null
+++ b/tools/charcrawler/crawler/acd_anime.go
@@ -0,0 +1,44 @@
+package crawler
+
+import (
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+type ACDAnimeCrawler struct{}
+
+func (a ACDAnimeCrawler) Name() string {
+	return "ACDAnime"
+}
+
+func (a ACDAnimeCrawler) Crawl(id int) (CharacterData, error) {
+	c := make(CharacterData)
+	doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/series.php?id=" + strconv.Itoa(id))
+	if err != nil {
+		log.Println(err)
+		return nil, CrawlError
+	}
+	text := doc.Text()
+	if strings.Contains(text, "bad series : try") {
+		return nil, CharacterNotFound
+	}
+	if strings.Contains(text, "Your IP has been blocked") {
+		return nil, Banned
+	}
+	tds := doc.Find("#besttable > table > tbody > tr > td")
+	val, _ := tds.Eq(0).Find("img").Attr("src")
+	c["__thumb"] = val
+	tds.Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) {
+		key := s.Find("th").Text()
+		value := s.Find("td").Text()
+		if key == "Home Page" {
+			val, _ = s.Find("td > a").Attr("href")
+			value = val
+		}
+		c[key] = value
+	})
+	return c, nil
+}
diff --git a/tools/charcrawler/crawler/anidb.go b/tools/charcrawler/crawler/anidb.go
new file mode 100644
index 0000000..12bf05b
--- /dev/null
+++ b/tools/charcrawler/crawler/anidb.go
@@ -0,0 +1,69 @@
+package crawler
+
+import (
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+type AniDBCrawler struct{}
+
+func (a AniDBCrawler) Name() string {
+	return "AniDB"
+}
+
+func (a AniDBCrawler) Crawl(id int) (CharacterData, error) {
+	c := make(CharacterData)
+	doc, err := goquery.NewDocument("http://anidb.net/perl-bin/animedb.pl?show=character&charid=" + strconv.Itoa(id))
+	if err != nil {
+		log.Println(err)
+		return nil, CrawlError
+	}
+	siteErr, _ := doc.Find(".error p").Html()
+	if strings.Contains(siteErr, "Unknown character id") {
+		return nil, CharacterNotFound
+	} else if strings.Contains(siteErr, "BANNED") {
+		return nil, CrawlError
+	}
+	first := true
+	links := make([]string, 0)
+	doc.Find(".characters").Each(func(i int, s *goquery.Selection) {
+		if first {
+			s.Find(".g_definitionlist tr").Each(func(i int, s *goquery.Selection) {
+				keyHtml, _ := s.Find("th").Html()
+				valueHtml, _ := s.Find("td").Html()
+				key := strings.TrimSpace(keyHtml)
+				value := strings.TrimSpace(valueHtml)
+				c[key] = value
+			})
+			desc, _ := s.Find(".desc").Html()
+			c["__desc"] = strings.TrimSpace(desc)
+			imgUrl, _ := s.Find(".image img").Attr("src")
+			c["__img"] = imgUrl
+			animes := make([]map[string]string, 0)
+			s.Find(".animelist tr").Each(func(i int, s *goquery.Selection) {
+				if s.HasClass("header") {
+					return
+				}
+				anime := make(map[string]string)
+				relHtml, _ := s.Find(".relation").Html()
+				nameHtml, _ := s.Find(".name a").Html()
+				rel := strings.TrimSpace(relHtml)
+				name := strings.TrimSpace(nameHtml)
+				anime["rel"] = rel
+				anime["name"] = name
+				animes = append(animes, anime)
+			})
+			c["__animes"] = animes
+		} else {
+			link, _ := s.Find(".mainname a").Attr("href")
+			links = append(links, strings.Replace(link, "http://anidb.net/ch", "", 1))
+		}
+		first = false
+	})
+	c["__links"] = links
+
+	return c, nil
+}
diff --git a/tools/charcrawler/crawler/crawler.go b/tools/charcrawler/crawler/crawler.go
new file mode 100644
index 0000000..3d27ea2
--- /dev/null
+++ b/tools/charcrawler/crawler/crawler.go
@@ -0,0 +1,70 @@
+package crawler
+
+import (
+	"encoding/json"
+	"errors"
+	"io/ioutil"
+	"log"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+)
+
+type CharacterData map[string]interface{}
+
+type Crawler interface {
+	Name() string
+	Crawl(id int) (CharacterData, error)
+}
+
+var Instances []Crawler
+
+var (
+	CrawlError        = errors.New("Error while crawling")
+	CharacterNotFound = errors.New("Character not found")
+	Banned            = errors.New("Crawler banned from source")
+)
+
+func Start(c Crawler) {
+	name := c.Name()
+	os.MkdirAll("data/"+name, 0755)
+	log.Printf("Starting Crawler %s...", name)
+	ticker := time.NewTicker(time.Second * 1)
+	current := 1
+	if save, err := ioutil.ReadFile(name + ".txt"); err == nil {
+		s := strings.TrimSpace(string(save))
+		if i, err := strconv.Atoi(s); err == nil {
+			current = i
+		}
+	}
+	faultCounter := 0
+	for range ticker.C {
+		if faultCounter > 100 {
+			faultCounter = 0
+			log.Printf("[%s] Exiting after 100 fails", name)
+			break
+		}
+		log.Printf("[%s] Crawling %d", name, current)
+		char, err := c.Crawl(current)
+		switch err {
+		case CharacterNotFound:
+			log.Printf("[%s] Char %d not found!", name, current)
+			faultCounter++
+		case CrawlError:
+			panic(err)
+		case Banned:
+			panic(err)
+		default:
+			cData, _ := json.Marshal(char)
+			ioutil.WriteFile("data/"+name+"/"+strconv.Itoa(current)+".json", cData, 0755)
+		}
+
+		current++
+		ioutil.WriteFile(name+".txt", []byte(strconv.Itoa(current)), os.ModePerm)
+	}
+}
+
+func init() {
+	Instances = append(Instances, new(ACDCrawler), new(ACDAnimeCrawler))
+}
diff --git a/tools/charcrawler/main b/tools/charcrawler/main
new file mode 100755
index 0000000..71f9896
--- /dev/null
+++ b/tools/charcrawler/main
Binary files differ
diff --git a/tools/charcrawler/main.go b/tools/charcrawler/main.go
new file mode 100644
index 0000000..0c94151
--- /dev/null
+++ b/tools/charcrawler/main.go
@@ -0,0 +1,16 @@
+package main
+
+import (
+	"time"
+
+	"private/charcrawler/crawler"
+)
+
+func main() {
+	for _, c := range crawler.Instances {
+		go crawler.Start(c)
+	}
+	for {
+		time.Sleep(time.Minute)
+	}
+}