Python提取网页链接和标题

Python017

Python提取网页链接和标题,第1张

方法1:BS版

先写了一个简单版本,目前只能抓取链接;加上提取标题的代码后总是报错,暂时没有查出原因,先把代码贴上来(方法2没有这个问题)。

from BeautifulSoup import BeautifulSoup

import urllib2

import re

def grabHref(url, localfile):
    """Save every link on a page whose href contains ``/tools/``.

    The page at *url* is downloaded, its anchors are parsed with
    BeautifulSoup, and each matching href is written to *localfile*
    (one per line, terminated with ``\\r\\n``) and echoed to stdout.

    Args:
        url: Address of the page to download.
        localfile: Path of the output file; it is overwritten.
    """
    html = urllib2.urlopen(url).read()
    # Re-encode the page as UTF-8 bytes before parsing.
    # NOTE(review): assumes the server sends GB2312; undecodable bytes are
    # silently dropped by 'ignore' - confirm the page's real charset.
    html = html.decode('gb2312', 'ignore').encode('utf-8', 'ignore')

    # The context manager closes the file even if parsing or writing fails.
    with open(localfile, 'w') as myfile:
        for anchor in BeautifulSoup(html).findAll('a'):
            # Read the attribute straight from the parsed tag; anchors that
            # carry no href at all (e.g. <a name="top">) yield None.
            href = anchor.get('href')
            if not href or '/tools/' not in href:
                continue
            # Attribute values are unicode; encode once so the file and the
            # console receive the same UTF-8 bytes.
            line = href.encode('utf-8')
            myfile.write(line)
            myfile.write('\r\n')
            print(line)

def main():
    """Crawl the FreeBuf tools index and store its tool links in aHref.txt."""
    url = "http://www.freebuf.com/tools"
    localfile = 'aHref.txt'
    grabHref(url, localfile)

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

方法2:Re版。由于方法1有问题,只能获取到下载页面的链接(取不到标题),所以改用正则表达式(re)来解决,代码如下:

import urllib2

import re

# Regex-based variant: collect "href title" pairs for every tool link on the
# page and write them to aHref.txt, one pair per line.
url = 'http://www.freebuf.com/tools'

# Group 1 is the href value, group 2 the link text.  ``[^>]*`` consumes any
# remaining attributes of the opening tag and, unlike ``.+?``, also matches
# when href is the last attribute (``<a href="...">text</a>``).
find_re = re.compile(r'href="([^"]*)"[^>]*>(.+?)</a>')
pat2 = re.compile(r'/tools/')

html = urllib2.urlopen(url).read()
# NOTE(review): the output is re-encoded as GB2312 and characters outside
# that charset are dropped by 'ignore' - confirm this is the encoding wanted
# for aHref.txt.
html = html.decode('utf-8', 'ignore').encode('gb2312', 'ignore')

# The context manager closes the file even if a write fails.
with open('aHref.txt', 'w') as myfile:
    for href, title in find_re.findall(html):
        # Filter on the link target only, so a title that happens to contain
        # "/tools/" does not let an unrelated link through.
        if pat2.search(href):
            myfile.write('%s %s\n' % (href, title))

print('Done!')

import urllib2

# BeautifulSoup 3 ships as the module ``BeautifulSoup`` (case-sensitive).
from BeautifulSoup import BeautifulSoup


def main():
    """Download a page and print the text of each <label> element on it."""
    # Placeholder meaning "the address you want to crawl": replace it with a
    # real URL before running, otherwise urllib2 rejects it.
    userMainUrl = "你要抓取的地址"
    req = urllib2.Request(userMainUrl)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()

    # read() returns the raw markup as a string; it has to be parsed before
    # it can be searched for tags.
    soup = BeautifulSoup(respHtml)
    # findAll returns a list of tags, so each label is handled individually.
    for foundLabel in soup.findAll("label"):
        # Join every text node so labels containing nested markup still
        # produce their visible text.
        finalL = ''.join(foundLabel.findAll(text=True)).strip()
        print("biaoti= %s" % finalL)


if __name__ == "__main__":
    main()