-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathMakefile
98 lines (87 loc) · 3.21 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# venv: create a Python virtualenv at ~/venv/whirlwind and print the command
# that activates it. The activation itself is left to the caller: each recipe
# line runs in its own shell, so sourcing the script here would have no
# lasting effect.
# Declared phony so that a file or directory named "venv" in the working tree
# cannot make this target look up to date and silently skip the recipe.
.PHONY: venv
venv:
	@echo "making a venv in ~/venv/whirlwind"
	mkdir -p ~/venv
	virtualenv -p python ~/venv/whirlwind
	@echo
	@echo "now you have to activate it:"
	@echo "source ~/venv/whirlwind/bin/activate"
# install: install the Python packages listed in requirements.txt using pip.
# Declared phony so a file named "install" never suppresses the recipe.
.PHONY: install
install:
	pip install -r requirements.txt
# iterate: run warcio-iterator.py over each of the three local whirlwind
# files (WARC, WET, WAT) in turn, printing a label before each run.
# Declared phony because it is a command, not a file that gets built.
.PHONY: iterate
iterate:
	@echo iterating over all of the local warcs:
	@echo
	@echo warc:
	python ./warcio-iterator.py whirlwind.warc.gz
	@echo
	@echo wet:
	python ./warcio-iterator.py whirlwind.warc.wet.gz
	@echo
	@echo wat:
	python ./warcio-iterator.py whirlwind.warc.wat.gz
	@echo
# cdxj: (re)generate a .cdxj index next to each local whirlwind file using
# cdxj-indexer. The WET file is indexed with "--records conversion"
# (presumably because WET text records use that WARC record type -- confirm
# against the cdxj-indexer documentation).
# Declared phony: the target name is not one of the files the recipe writes.
.PHONY: cdxj
cdxj:
	@echo "creating *.cdxj index files from the local warcs"
	cdxj-indexer whirlwind.warc.gz > whirlwind.warc.cdxj
	cdxj-indexer --records conversion whirlwind.warc.wet.gz > whirlwind.warc.wet.cdxj
	cdxj-indexer whirlwind.warc.wat.gz > whirlwind.warc.wat.cdxj
# extract: pull one record payload out of each local whirlwind file with
# "warcio extract --payload" and save it as extraction.html / .txt / .json.
# The numeric arguments (1023, 466, 443) are record offsets taken from the
# cdxj index, as the echoed message says; they are hard-coded here and need
# updating if the underlying files change.
# Declared phony: the target name is not one of the files the recipe writes.
.PHONY: extract
extract:
	@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
	warcio extract --payload whirlwind.warc.gz 1023 > extraction.html
	warcio extract --payload whirlwind.warc.wet.gz 466 > extraction.txt
	warcio extract --payload whirlwind.warc.wat.gz 443 > extraction.json
	@echo "hint: python -m json.tool extraction.json"
# cdx_toolkit: end-to-end demo using the cdxt command line tool:
#   1. look up a single capture of an.wikipedia.org/wiki/Escopete in crawl
#      CC-MAIN-2024-22, pinned to timestamp 20240518015810;
#   2. fetch that capture as a WARC (the recipe expects the result in
#      TEST-000000.extracted.warc.gz and removes any stale copy first);
#   3. index the new WARC with cdxj-indexer and print the index;
#   4. iterate over the new WARC with warcio-iterator.py.
# An alternative lookup that was kept for reference but is not run:
#   cdxt --cc --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
# Declared phony because it is a command, not a file that gets built.
.PHONY: cdx_toolkit
cdx_toolkit:
	@echo look up this capture in the commoncrawl cdx index
	cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
	@echo
	@echo extract the content from the commoncrawl s3 bucket
	rm -f TEST-000000.extracted.warc.gz
	cdxt --limit 1 --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
	@echo
	@echo index this new warc
	cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
	cat TEST-000000.extracted.warc.cdxj
	@echo
	@echo iterate this new warc
	python ./warcio-iterator.py TEST-000000.extracted.warc.gz
	@echo
# download_collinfo: fetch collinfo.json from index.commoncrawl.org into the
# current directory (curl -O keeps the remote file name).
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as collinfo.json and reporting success.
# Declared phony so the download is repeated on every invocation.
.PHONY: download_collinfo
download_collinfo:
	@echo "downloading collinfo.json so we can find out the crawl name"
	curl --fail -O https://index.commoncrawl.org/collinfo.json
# CC-MAIN-2024-22.warc.paths.gz: build a gzip-compressed list of the object
# names under the CC-MAIN-2024-22 warc subset of the cc-index table, taken
# from the 4th column of "aws s3 ls" output.
# This is a real file target (not phony): it is only rebuilt when missing.
# The listing is written to a scratch file first, because in a plain
# "aws | awk | gzip" pipeline the exit status is gzip's: a failed listing
# (for example missing credentials) would still leave an empty archive that
# make then treats as up to date.
CC-MAIN-2024-22.warc.paths.gz:
	@echo "downloading the list from s3, requires s3 auth even though it is free"
	@echo "note that this file should be in the repo"
	aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ > $@.listing || { rm -f $@.listing; exit 1; }
	awk '{print $$4}' $@.listing | gzip -9 > $@
	rm -f $@.listing
# duck_local_files: run duck.py in its "local_files" mode. The echoed warning
# states that this involves a 300 gigabyte download.
# Declared phony because it is a command, not a file that gets built.
.PHONY: duck_local_files
duck_local_files:
	@echo "warning! 300 gigabyte download"
	python duck.py local_files
# duck_ccf_local_files: run duck.py in its "ccf_local_files" mode. Per the
# echoed warning this only works on the Common Crawl Foundation's own
# development machine.
# Declared phony because it is a command, not a file that gets built.
.PHONY: duck_ccf_local_files
duck_ccf_local_files:
	@echo "warning! only works on Common Crawl Foundation's development machine"
	python duck.py ccf_local_files
# duck_cloudfront: run duck.py in its "cloudfront" mode. The echoed warning
# says to expect a run time of roughly 1-10 minutes.
# Declared phony because it is a command, not a file that gets built.
.PHONY: duck_cloudfront
duck_cloudfront:
	@echo "warning! this might take 1-10 minutes"
	python duck.py cloudfront
# wreck_the_warc: demonstrate why a WARC must be compressed record-by-record.
# Works on a scratch copy (testing.warc / testing.warc.gz) so the original
# whirlwind.warc.gz is never modified:
#   1. copy and gunzip the WARC, iterate over the uncompressed file;
#   2. compress it with plain gzip and show that iterating now fails;
#   3. gunzip again, compress with "warcio recompress", and iterate again.
# Declared phony because it is a command, not a file that gets built.
.PHONY: wreck_the_warc
wreck_the_warc:
	@echo
	@echo we will break and then fix this warc
	cp whirlwind.warc.gz testing.warc.gz
	rm -f testing.warc
	gunzip testing.warc.gz
	@echo
	@echo iterate over this uncompressed warc: works
	python ./warcio-iterator.py testing.warc
	@echo
	@echo compress it the wrong way
	gzip testing.warc
	@echo
	@echo iterating over this compressed warc fails
# The failure below is the point of the demo, so it must not abort make.
# The shell builtin "true" is used instead of an absolute path, which is not
# at the same location on every system.
	python ./warcio-iterator.py testing.warc.gz || true
	@echo
	@echo "now let's do it the right way"
	gunzip testing.warc.gz
	warcio recompress testing.warc testing.warc.gz
	@echo
	@echo and now iterating works
	python ./warcio-iterator.py testing.warc.gz
	@echo