代码像这样
"""Scrape the NYOJ problem list and append it to a CSV file.

For every problem-set page the script downloads the HTML, extracts one
record per problem with a regular expression and appends the records to
``naojproblem.csv``, echoing every written row to stdout.

Written for Python 2 (``urllib2``); only syntax that is also valid on
Python 3 is used.
"""
__author__ = 'jtahstu'

import urllib2
import re
import sys

# Exclusive upper bound of the page loop: pages 1 .. page - 1 are fetched.
page = 13

BASE_URL = 'http://acm.nyist.net/JudgeOnline/problemset.php?page='
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
OUTPUT_PATH = 'naojproblem.csv'

# Compiled once instead of on every page. Capture groups, in order:
#   0 problem id
#   1 difficulty
#   2 title
#   3 AC ratio (the text before " %")
#   4 total accepted count
#   5 total submission count
PROBLEM_ROW_RE = re.compile(
    r'<TR.*?<TD>.*?<TD>(.*?)</TD>.*?<TD>(.*?)</TD>'
    r'.*?probname tal.*?<a.*?">(.*?)</a>'
    r'.*?<TD width="55" >(.*?) %</TD>'
    r'.*?tal">\((.*?)/(.*?)\)</TD>',
    re.S,
)


def format_csv_row(fields):
    """Return *fields* as one CSV line (without the line terminator).

    Every field is wrapped in double quotes and embedded double quotes are
    doubled, so a field containing ``"`` or ``,`` still yields a
    well-formed row. Fields without quotes are rendered exactly as the
    plain ``'"' + field + '"'`` concatenation would render them.
    """
    return ','.join('"' + field.replace('"', '""') + '"' for field in fields)


for i in range(1, page):
    url = BASE_URL + str(i)

    # Only the network calls are guarded; a failed page is reported and
    # skipped so the remaining pages are still fetched.
    try:
        request = urllib2.Request(url, headers=HEADERS)
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            response.close()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        continue

    problems = PROBLEM_ROW_RE.findall(content)

    # Append mode: every page adds its rows to the same file. The file is
    # opened only after a successful download and is closed even if a
    # write fails.
    with open(OUTPUT_PATH, 'ab') as out:
        for res in problems:
            data = format_csv_row(res)
            out.write(data + "\r\n")
            print(data)
去掉注释24行,生成csv格式文件,然后需要导入数据库,表结构如下
-- Table that receives the scraped problem list (one row per problem).
-- The column notes below map columns to the CSV fields described by the
-- scraper's comments; the mapping is presumed from field order and names.
-- NOTE(review): the table default charset is latin1; only `title` is
-- declared utf8.
CREATE TABLE `nyojproblemlist` (
  `pid` int(11) NOT NULL,                                   -- presumably CSV field 0: problem id
  `difficult` int(11) DEFAULT NULL,                         -- presumably CSV field 1: difficulty
  `title` varchar(255) CHARACTER SET utf8 DEFAULT NULL,     -- presumably CSV field 2: title
  `ratio` int(11) DEFAULT NULL,                             -- presumably CSV field 3: AC ratio; NOTE(review): confirm it is always a whole number
  `ac` int(11) DEFAULT NULL,                                -- presumably CSV field 4: total accepted count
  `submit` int(11) DEFAULT NULL,                            -- presumably CSV field 5: total submission count
  PRIMARY KEY (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
共有1141条数据,csv文件如下
---
本文章采用 知识共享署名2.5中国大陆许可协议 进行许可,转载必须注明作者和本文链接。
---
发表评论