11-exercise-web-crawler.go
// https://tour.golang.org/concurrency/10
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
// UrlCache records URLs that have already been fetched,
// guarded by a mutex so concurrent crawls can share it safely.
type UrlCache struct {
	urls map[string]bool
	mux  sync.Mutex
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cache *UrlCache, results chan string) {
	// Each crawl goroutine gets its own results channel,
	// which is closed automatically upon return.
	defer close(results)

	// Lock the shared URL cache. The cache is passed by pointer so that
	// every goroutine locks the same mutex and sees the same map.
	cache.mux.Lock()
	// Don't fetch the same URL twice, and respect the depth limit.
	if cache.urls[url] || depth <= 0 {
		cache.mux.Unlock()
		return
	}
	cache.urls[url] = true
	// Done working on the URL cache.
	cache.mux.Unlock()

	// Fetch the page.
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		results <- fmt.Sprintf("%v\n", err)
		return
	}

	// Send the result to the channel.
	results <- fmt.Sprintf("found: %s %q\n", url, body)

	// Create a channel for each child crawl, then crawl the URLs concurrently.
	result := make([]chan string, len(urls))
	for i, u := range urls {
		result[i] = make(chan string)
		go Crawl(u, depth-1, fetcher, cache, result[i])
	}

	// Forward the children's results to the original results channel.
	for i := range result {
		for s := range result[i] {
			results <- s
		}
	}
}
func main() {
	// Create the results channel and the shared URL cache.
	results := make(chan string)
	cache := &UrlCache{urls: make(map[string]bool)}

	// Crawl in a separate goroutine so main can drain results as they arrive.
	go Crawl("http://golang.org/", 4, fetcher, cache, results)

	// Print results until the top-level Crawl closes the results channel.
	for res := range results {
		fmt.Print(res)
	}
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
	body string
	urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"http://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"http://golang.org/pkg/",
"http://golang.org/cmd/",
},
},
"http://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"http://golang.org/",
"http://golang.org/cmd/",
"http://golang.org/pkg/fmt/",
"http://golang.org/pkg/os/",
},
},
"http://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
"http://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
}