From dcc28883c43c42e1f56228559c463f8352b5441a Mon Sep 17 00:00:00 2001 From: Hai Liang Wang Date: Sun, 29 Jan 2017 23:17:08 +0800 Subject: [PATCH 1/3] Delete *.pyc --- subtitle/subtitle_crawler/__init__.pyc | Bin 172 -> 0 bytes subtitle/subtitle_crawler/items.pyc | Bin 396 -> 0 bytes subtitle/subtitle_crawler/pipelines.pyc | Bin 835 -> 0 bytes subtitle/subtitle_crawler/settings.pyc | Bin 723 -> 0 bytes subtitle/subtitle_crawler/spiders/__init__.pyc | Bin 180 -> 0 bytes .../subtitle_crawler/spiders/subtitle_spider.pyc | Bin 8045 -> 0 bytes 6 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 subtitle/subtitle_crawler/__init__.pyc delete mode 100644 subtitle/subtitle_crawler/items.pyc delete mode 100644 subtitle/subtitle_crawler/pipelines.pyc delete mode 100644 subtitle/subtitle_crawler/settings.pyc delete mode 100644 subtitle/subtitle_crawler/spiders/__init__.pyc delete mode 100644 subtitle/subtitle_crawler/spiders/subtitle_spider.pyc diff --git a/subtitle/subtitle_crawler/__init__.pyc b/subtitle/subtitle_crawler/__init__.pyc deleted file mode 100644 index fbf82db4985a25407d9be084b7f620054f4241f1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 172 zcmZSn%**xriCK6u0~9at$Neli diff --git a/subtitle/subtitle_crawler/items.pyc b/subtitle/subtitle_crawler/items.pyc deleted file mode 100644 index ca8dc59db9ebb87f6bff19e1a417c57b7e9df7a0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 396 zcmb79u};G<6gG|y{RxFA?=1rJmt862=ycvi3)@4y@IRAr(l^AXj} z57$FHB`y19Y@&C!UH$4^cOPAQ_(Adhh84~Uw5$|dhP4*IYV?PXQIt6~$R~t&L@?&1 zZk;i4nwoZFdoQgB!m6vcLmZRdHO|{8qx7ARl5M%M!_JEh5({T!^F`Q4HGD2v7Y)OP n`F1EodCUr8Fv(^r7so;%dCF8t^iSh7$>lEll}r^f!c6}HYGh9I diff --git a/subtitle/subtitle_crawler/pipelines.pyc b/subtitle/subtitle_crawler/pipelines.pyc deleted file mode 100644 index 097899cead0d82a7cd34051bf073bacf8fc32f56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 835 zcmcIhO>5gg5S^85r!g%w^bpcR&N@~1Qc4^5Y=TWG^<|NzaU-*~#O|mjV4s|SmY>vf zAa7PC{RI)so6ntjno0k?8-IU#{8iHFOQF9(!bdI;6Bu zX$>+7RVf{lXb(>IVL*bqA?*irS&cM(fHS0Z5*wV+)zOgA0NxTT1M+XsekwYx^A7Kc z@`yI#=WvFI=g>Ln<2xB3vTUmRObzMttt(27nWp8f43XLSS~yyzYXWEbQ?VkjTSWu; zjc7Qub;W+B!{r>XsvX;Hv0+KnYcn842vb=3&%i$9t}gGoV*4lm!GGCduFofTh1@hU zX*wS`52tgy7`E&WleeeKgir0gyhY2_HzkK)mC_h~&>)$yxi%ln1$2L8U6zf^tQD=Z zcGFZH*xK29xc`a17TX!7knSkxYDQ+7sg7H0mTN907Q8+W(N%oufBtLa1D~D&VQIed diff --git a/subtitle/subtitle_crawler/settings.pyc b/subtitle/subtitle_crawler/settings.pyc deleted file mode 100644 index 234e50eb73bf468e60ce06a31725375c8416d85c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 723 zcmaJ;O>fgc5M9TOn}iQ>;81ZuD^96Y*-1*mfeMk>+hA&YO>C3mi{-{kvg$UrycdI!X;xKzxXbCQ4g~Lm4oKXq2{5(L!kl6>SuCJ0Pc8Tj9->;z}@i^jDHC z>inx}z; zusOTB;K~}FzA{foxO?#~_PrA<_%*{*wzw{jvAwFx8)glAX3rdqMt!s28{jDaoY#Ec z#?Rn(zI6h&0XPAa1BmWszaO{a6%%~Tq#5nkvVSiU?hn)fw7J1K4Z05?+BqBawOnR6yqcuxjgF5<%5?N`hE5AwAE9WIKTw1-&S~j59hRpLPNc zod+HrJ6Y`dgeI|i4jncP>|{0rfow)TBqWNXECfx5`eQdx&;C>5hP#{GnD|-fhQxCN z7_|)@9-U?$`AEDjB{ozjb&)MnKUw*&W!R}9%|1a~SV;01J z9$0Gu%7X%4c<}h&W(JgTFS|psXl32-EaD0Db7w7U&%|ma zjkaQy-ZH9h4vgDx+xF(LM=E`38=nVP`O&$9()X$N^g$;Xn5Z|

5hX?_0mnW0MgF8a*bM}D{H zNB(j*RqO7Cx)a6mTGV;$>JZ~QYZa@GNm64E|7;kUuCX-=^~|JE$2fhI#+ylRV62I{ zlSaemt~pdsMoJqq!_C7e)v8U3&k+~wIJSfbt9ZO^s`M}!XyrDshtaOz9m`t7JGdp& z@sb5@f)n~7YyKV0_V{tH(;Y>F4gZ>Ys^Wxg^jEr(xtf@jWW?M1d)MzTl{O#NmP~x@ zN+kMK$GBmEEwN_1(~nqj7x(avd$Zm_mM*Z?5c{~mUT5XCf#ram;NWsycGLa*X$aXiE*y&f{s)QCjx~8;hqy zdp_s((5@QV0E9M*VcrT`jWAY>qqN?ORl&{(?KJGI1J>Gl!Fe>sTHxv7fD$VJ*9tAkHR-*;+!+UDAX{d8yD= z+g#_ronU+2gdUgJ-lQXJmG4)6cJa%zBWAs6uVDS+lC<3(M19q6yC}BX{e(ubeYH`v zJ?0=QoZ88jqBbRc`!#T5wTV*G=Je9xVIJaK9@_0*ndh|sJkLV=jkE#Rf3TSJCcS(n z>-`^o^V$4VezM?_%rdLFh-VWpuhGmoAc%}e*?AE BM412p From 54d7c45273b11e6e5090289a9b2daddb5574562f Mon Sep 17 00:00:00 2001 From: Hai Liang Wang Date: Sun, 29 Jan 2017 23:18:02 +0800 Subject: [PATCH 2/3] Add save script --- .gitignore | 19 +++++++++++++++++++ subtitle/subtitle_crawler/save.sh | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 .gitignore create mode 100755 subtitle/subtitle_crawler/save.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e65de23 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# maven +chatbotv1/target/ + +# py +__pycache__/ +*.pyc + +# vim +*.swp +*.swo + +# idea +*.iml +*.ipr +*.iws +.idea/ + +# tmp files +subtitle/subtitle_crawler/result/ \ No newline at end of file diff --git a/subtitle/subtitle_crawler/save.sh b/subtitle/subtitle_crawler/save.sh new file mode 100755 index 0000000..3bda9d1 --- /dev/null +++ b/subtitle/subtitle_crawler/save.sh @@ -0,0 +1,19 @@ +#! /bin/bash +########################################### +# Save file with CURL +########################################### + +# constants +baseDir=$(cd `dirname "$0"`;pwd) +SAVE_PATH=$baseDir/result +# functions + +# main +[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return +if [ ! -d "$SAVE_PATH" ]; then + # Control will enter here if $DIRECTORY doesn't exist. + mkdir -p $SAVE_PATH +fi + +cd $SAVE_PATH +curl -JLO $1 From c8b00399392cd7bb3b7d3180e19b3a952cc108eb Mon Sep 17 00:00:00 2001 From: Hai Liang Wang Date: Sun, 29 Jan 2017 23:24:44 +0800 Subject: [PATCH 3/3] Download with save --- subtitle/README.md | 16 ++++++++++++++++ subtitle/subtitle_crawler/pipelines.py | 13 ++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 subtitle/README.md diff --git a/subtitle/README.md b/subtitle/README.md new file mode 100644 index 0000000..2aeb52a --- /dev/null +++ b/subtitle/README.md @@ -0,0 +1,16 @@ +# 爬取字幕语料 + +* 依赖curl +* 不支持 Windows +* Tested with py2.7 + +## Install +``` +pip install scrapy +``` + +## Download Data +``` +cd ChatBotCourse/subtitle/subtitle_crawler +scrapy crawl subtitle +``` \ No newline at end of file diff --git a/subtitle/subtitle_crawler/pipelines.py b/subtitle/subtitle_crawler/pipelines.py index 9577466..8faf29c 100644 --- a/subtitle/subtitle_crawler/pipelines.py +++ b/subtitle/subtitle_crawler/pipelines.py @@ -4,14 +4,17 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +import os +from subprocess import call +DOWNLOAD_CMD = os.path.join(os.path.dirname( + os.path.realpath(__file__)), 'save.sh') class SubtitleCrawlerPipeline(object): + ''' + Download Subtitle with curl + ''' def process_item(self, item, spider): url = item['url'] - file_name = url.replace('/','_').replace(':','_') - fp = open('result/'+file_name, 'w') - fp.write(item['body']) - fp.close() + call([DOWNLOAD_CMD, url]) return item -