和上一篇文章基本雷同
# -*- coding: utf-8 -*-
"""Scrape problem statements from the NYIST online judge into a CSV file.

For every problem id in ``1 .. pid - 1`` the problem page is downloaded.
Pages containing the "not yet public" marker are skipped; for the others the
HTML between the problem header and the submit section is appended to
``nyojproblemlist.csv`` as a ``"pid","content"`` row.

Written for Python 2 (``urllib2``, ``xrange``).
"""
__author__ = 'jtahstu'

import csv
import re
import sys
import time
import urllib2

# Exclusive upper bound of the problem ids to fetch (ids 1 .. pid - 1).
pid = 1247

BASE_URL = 'http://acm.nyist.net/JudgeOnline/problem.php?pid='
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
OUTPUT_FILE = 'nyojproblemlist.csv'

# Patterns are compiled once instead of on every loop iteration.
# Marker text the page shows when a problem has not been made public.
NOT_PUBLIC = re.compile('题目尚未公开')
# Captures everything between the problem header and the submit section.
PROBLEM_BODY = re.compile(
    '.*?class="problem-display".*?</H4>(.*?)<DL class="problem-submit">',
    re.S)

for i in xrange(1, pid):
    url = BASE_URL + str(i)
    try:
        request = urllib2.Request(url, headers=HEADERS)
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        if not NOT_PUBLIC.search(content):
            bodies = PROBLEM_BODY.findall(content)
            # Binary append mode is what the Python 2 csv module expects.
            with open(OUTPUT_FILE, 'ab') as out:
                # The csv writer doubles any '"' inside the HTML, so every row
                # stays a well-formed two-field record. Hand-built rows left
                # embedded quotes unescaped, which is a likely cause of rows
                # being rejected when the file is imported.
                writer = csv.writer(out, quoting=csv.QUOTE_ALL,
                                    lineterminator='\r\n')
                for body in bodies:
                    writer.writerow([str(i), body])
                    # Progress output: the raw (unescaped) record.
                    print('"' + str(i) + '","' + body + '"')
    except urllib2.URLError as e:
        # HTTP errors carry a status code; lower-level failures carry a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Pause between requests to avoid hammering the server.
    time.sleep(1)
-- Table with one row per problem: the numeric problem id and its HTML content.
-- Presumably the import target for nyojproblemlist.csv (verify against the import step).
CREATE TABLE `nyojproblems` (
  `pid` int(11) NOT NULL,
  `content` text CHARACTER SET utf8,
  PRIMARY KEY (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
csv文件导入时有一百多条出错,成功导入1072条数据
[Finished in 1861.9s]
抓取总耗时约31分钟;去掉睡眠时间后,抓取这一千多道题实际用时约10分钟。
---
本文章采用 知识共享署名2.5中国大陆许可协议 进行许可,转载必须注明作者和本文链接。
---
发表评论