输入url,得到html,我早就写了函数了
自己搜:
getUrlRespHtml
就可以找到对应的python函数:
#------------------------------------------------------------------------------
def getUrlResponse(url, postDict=None, headerDict=None, timeout=0, useGzip=False, postDataDelimiter="&") :
    """Get response from url, support optional postDict, headerDict, timeout, useGzip.

    Args:
        url: target url; converted to str because urllib2.urlopen errors on unicode
        postDict: optional dict of POST fields; if non-empty, the request
            automatically becomes POST instead of the default GET
        headerDict: optional dict of extra headers, added after the defaults
            so they can overwrite them
        timeout: socket timeout in seconds; 0 means use the global default
        useGzip: if True, advertise gzip/deflate via Accept-Encoding
        postDataDelimiter: delimiter used when hand-building the POST body
            (any value other than "&" bypasses urllib.urlencode)

    Returns:
        the urllib2 response object (caller is responsible for .read()).

    Note:
        1. if postDict is non-empty, the url request auto becomes POST instead of default GET
        2. if you want to auto handle cookies, call initAutoHandleCookies() before
           using this function; then following urllib2.Request will auto handle cookies
    """
    # Fresh dicts per call -- avoids the shared mutable-default-argument pitfall.
    if postDict is None:
        postDict = {}
    if headerDict is None:
        headerDict = {}
    # make sure url is string, not unicode, otherwise urllib2.urlopen will error
    url = str(url)
    if postDict:
        if postDataDelimiter == "&":
            postData = urllib.urlencode(postDict)
        else:
            # Hand-build key=value pairs joined by the custom delimiter.
            # join() also fixes the old trailing-delimiter bug (strip() only
            # removed whitespace, not the delimiter itself).
            postData = postDataDelimiter.join(
                str(eachKey) + "=" + str(eachValue)
                for eachKey, eachValue in postDict.items())
            postData = postData.strip()
        logging.info("postData=%s", postData)
        # supplying a data argument makes urllib2 issue a POST
        req = urllib2.Request(url, postData)
        logging.info("req=%s", req)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else:
        req = urllib2.Request(url)

    defHeaderDict = {
        'User-Agent'    : gConst['UserAgent'],
        'Cache-Control' : 'no-cache',
        'Accept'        : '*/*',
        'Connection'    : 'Keep-Alive',
    }

    # add default headers first
    for eachDefHd, eachDefVal in defHeaderDict.items():
        req.add_header(eachDefHd, eachDefVal)

    if useGzip:
        req.add_header('Accept-Encoding', 'gzip, deflate')

    # add customized headers later -> allows overwriting the defaults above
    for key, value in headerDict.items():
        req.add_header(key, value)

    if timeout > 0:
        # set timeout value if necessary
        resp = urllib2.urlopen(req, timeout=timeout)
    else:
        resp = urllib2.urlopen(req)

    # persist updated cookies into the local cookie file, if enabled
    if gVal['cookieUseFile']:
        gVal['cj'].save()
        logging.info("gVal['cj']=%s", gVal['cj'])

    return resp
#------------------------------------------------------------------------------
# get response html==body from url
#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
def getUrlRespHtml(url, postDict=None, headerDict=None, timeout=0, useGzip=True, postDataDelimiter="&") :
    """Get the response body (html) from url.

    Thin wrapper over getUrlResponse that reads the whole body and
    transparently decompresses it when the server declares gzip/deflate.

    Args:
        url: target url
        postDict: optional dict of POST fields (non-empty -> POST request)
        headerDict: optional dict of extra request headers
        timeout: socket timeout in seconds; 0 means use the global default
        useGzip: request gzip/deflate encoding (default True here, unlike
            getUrlResponse, since we decompress below anyway)
        postDataDelimiter: delimiter for hand-built POST bodies

    Returns:
        the (decompressed, if applicable) response body as a byte string.
    """
    # Fresh dicts per call -- avoids the shared mutable-default-argument pitfall.
    if postDict is None:
        postDict = {}
    if headerDict is None:
        headerDict = {}
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip, postDataDelimiter)
    respHtml = resp.read()
    respInfo = resp.info()
    # Some servers return compressed data even without Accept-Encoding, and
    # some return plain html despite gzip being requested -- in that case the
    # response lacks a "Content-Encoding: gzip" header
    # (eg: http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html).
    # So only decompress when the response header really declares an encoding.
    if "Content-Encoding" in respInfo:
        if respInfo['Content-Encoding'] == "gzip":
            # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
            respHtml = zlib.decompress(respHtml, 16+zlib.MAX_WBITS)
        elif respInfo['Content-Encoding'] == "deflate":
            # negative wbits: raw deflate stream without a zlib header
            respHtml = zlib.decompress(respHtml, -zlib.MAX_WBITS)
    return respHtml
及示例代码:
url = "http://www.crifan.com"
respHtml = getUrlRespHtml(url)
完全库函数,自己搜:
crifanLib.py
关于抓取动态页面,详见:
Python专题教程:抓取网站,模拟登陆,抓取动态网页
(自己搜标题即可找到)