简单写了一个,只是爬链接的;加上标题就老报错,暂时没看出原因,先粘上来(方法 2 无问题)。
from BeautifulSoup import BeautifulSoup
import urllib2
import re
def grabHref(url,localfile):
html = urllib2.urlopen(url).read()
html = unicode(html,'gb2312','ignore').encode('utf-8','ignore')
content = BeautifulSoup(html).findAll('a')
myfile = open(localfile,'w')
pat = re.compile(r'href="([^"]*)"')
pat2 = re.compile(r'/tools/')
for item in content:
h = pat.search(str(item))
href = h.group(1)
if pat2.search(href):
# s = BeautifulSoup(item)
# myfile.write(s.a.string)
# myfile.write('\r\n')
myfile.write(href)
myfile.write('\r\n')
# print s.a.sting
print href
myfile.close()
def main():
    """Crawl the freebuf tools page and save the matching links to disk."""
    target_url = "http://www.freebuf.com/tools"
    output_path = 'aHref.txt'
    grabHref(target_url, output_path)


if __name__ == "__main__":
    main()
方法 2(Re 版):由于方法 1 有问题,只能获取到下载页面链接,所以改用 re 模块解决,代码如下:
import urllib2
import re
url = 'http://www.freebuf.com/tools'
find_re = re.compile(r'href="([^"]*)".+?>(.+?)</a>')
pat2 = re.compile(r'/tools/')
html = urllib2.urlopen(url).read()
html = unicode(html,'utf-8','ignore').encode('gb2312','ignore')
myfile = open('aHref.txt','w')
for x in find_re.findall(html):
if pat2.search(str(x)):
print >>myfile,x[0],x[1]
myfile.close()
print 'Done!'
import beautifulsoupimport urllib2
def main():
userMainUrl = "你要抓取的地址"
req = urllib2.Request(userMainUrl)
resp = urllib2.urlopen(req)
respHtml = resp.read()
foundLabel = respHtml.findAll("label")
finalL =foundLabel.string
print "biaoti=",finalL
if __name__=="__main__":
main()