Python通过HTTP协议定期抓取文件
分类:
Python
2007-07-26 18:56
698人阅读
评论(1)
收藏
举报
下面的脚本每隔 10 秒通过 HTTP 条件请求(If-Modified-Since)抓取一次文件,只有在文件发生变化时才输出内容;可以在此基础上扩充成为简单的定时抓取工具。
#!/usr/bin/python
"""Periodically fetch a file over HTTP and print it whenever it changes.

Every poll is a conditional GET carrying ``If-Modified-Since``, so an
unchanged file only costs a ``304 Not Modified`` round trip. The script runs
on Python 2 (``urllib2``) and on Python 3 (``urllib.request``).
"""
import time
from email.utils import formatdate

try:
    import urllib2
except ImportError:  # Python 3 keeps the same names in urllib.request
    import urllib.request as urllib2

URL = 'http://www.ibm.com/developerworks/js/ajax1.js'
POLL_INTERVAL = 10  # seconds to wait between two polls

NOT_MODIFIED_BANNER = '''
==============================
NOT MODIFIED
==============================
'''


class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Hand HTTP error statuses back as responses instead of raising them.

    The stock default handler raises ``HTTPError`` for every status the
    opener does not process itself, which includes ``304 Not Modified``.
    Returning the error object lets the caller branch on ``response.code``.
    """

    def http_error_default(self, req, fp, code, msg, headers):
        """Build and return (not raise) an ``HTTPError`` for this response."""
        result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        # Python 3.9+ already exposes a read-only ``status`` mirroring
        # ``code``; assigning to it there would raise AttributeError.
        if getattr(result, 'status', None) != code:
            result.status = code
        return result


def _as_text(body):
    """Decode a Python 3 ``bytes`` body so it prints as text, not ``b'...'``."""
    if isinstance(body, str):
        return body
    return body.decode('utf-8', 'replace')


def poll_once(opener, request, modified=None):
    """Fetch ``request`` once, report the outcome on stdout, arm the next poll.

    Args:
        opener: object whose ``open(request)`` returns a response exposing
            ``code``, ``headers``, ``read()`` and ``close()``.
        request: the ``Request`` to send; its ``If-Modified-Since`` header
            is overwritten with the returned value.
        modified: HTTP-date currently used as validator, or ``None`` when
            nothing has been fetched yet.

    Returns:
        The HTTP-date string to use as validator for the following poll.
    """
    response = opener.open(request)
    try:
        last_modified = response.headers.get('last-modified')
        if last_modified:
            modified = last_modified
        if response.code == 304:
            print(NOT_MODIFIED_BANNER)
        elif response.code == 200:
            print(_as_text(response.read()))
        else:
            print('there is an error')
    finally:
        # Release the connection even if reading or printing fails.
        response.close()
    if modified is None:
        # The server sent no Last-Modified: fall back to "now", formatted as
        # an HTTP-date. A raw time.time() float is not a valid header value.
        modified = formatdate(usegmt=True)
    request.add_header('If-Modified-Since', modified)
    return modified


def main():
    """Poll ``URL`` forever, pausing ``POLL_INTERVAL`` seconds between polls."""
    request = urllib2.Request(URL)
    opener = urllib2.build_opener(ErrorHandler())
    modified = None
    while True:
        modified = poll_once(opener, request, modified)
        time.sleep(POLL_INTERVAL)


if __name__ == '__main__':
    main()
# Plain-text listing of the polling script: fetch a file over HTTP at a fixed
# interval with a conditional GET (If-Modified-Since) and print the body only
# when it changed. Runs on Python 2 (urllib2) and Python 3 (urllib.request).
import time
from email.utils import formatdate

try:
    import urllib2
except ImportError:  # Python 3 keeps the same names in urllib.request
    import urllib.request as urllib2

URL = 'http://www.ibm.com/developerworks/js/ajax1.js'
POLL_INTERVAL = 10  # seconds to wait between two polls

NOT_MODIFIED_BANNER = '''
==============================
NOT MODIFIED
==============================
'''


class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Return HTTP error statuses to the caller rather than raising.

    Without this handler the opener raises ``HTTPError`` for statuses it
    does not process itself, ``304 Not Modified`` included; returning the
    object lets the polling code inspect ``response.code``.
    """

    def http_error_default(self, req, fp, code, msg, headers):
        """Create the ``HTTPError`` for this response and return it."""
        result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        # ``status`` is a read-only alias of ``code`` on Python 3.9+, so it
        # is only assigned where it is missing.
        if getattr(result, 'status', None) != code:
            result.status = code
        return result


def _as_text(body):
    """Turn a ``bytes`` body (Python 3) into text; pass ``str`` through."""
    if isinstance(body, str):
        return body
    return body.decode('utf-8', 'replace')


def poll_once(opener, request, modified=None):
    """Run a single conditional fetch and print what happened.

    Args:
        opener: object whose ``open(request)`` returns a response exposing
            ``code``, ``headers``, ``read()`` and ``close()``.
        request: the ``Request`` to send; its ``If-Modified-Since`` header
            is overwritten with the returned value.
        modified: HTTP-date currently used as validator, or ``None`` before
            the first fetch.

    Returns:
        The HTTP-date string to send as validator on the next poll.
    """
    response = opener.open(request)
    try:
        last_modified = response.headers.get('last-modified')
        if last_modified:
            modified = last_modified
        if response.code == 304:
            print(NOT_MODIFIED_BANNER)
        elif response.code == 200:
            print(_as_text(response.read()))
        else:
            print('there is an error')
    finally:
        # Always give the connection back, even when printing fails.
        response.close()
    if modified is None:
        # No Last-Modified from the server: use the current time as an
        # HTTP-date (a time.time() float is not a valid header value).
        modified = formatdate(usegmt=True)
    request.add_header('If-Modified-Since', modified)
    return modified


def main():
    """Poll ``URL`` in an endless loop, sleeping ``POLL_INTERVAL`` seconds."""
    request = urllib2.Request(URL)
    opener = urllib2.build_opener(ErrorHandler())
    modified = None
    while True:
        modified = poll_once(opener, request, modified)
        time.sleep(POLL_INTERVAL)


if __name__ == '__main__':
    main()
声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。
评论(0)