2 import re # 正则表达式
3 import bs4 # Beautiful Soup 4 解析模块
4 import urllib2 # 网络访问模块
5 import News #自己定义的新闻结构
6 import codecs #解决编码问题的关键 ,使用codecs.open打开文件
7 import sys #1解决不同页面编码问题
8
9 reload(sys) # 2
10 sys.setdefaultencoding('utf-8') # 3
11
12 # 从首页获取所有链接
13 def GetAllUrl(home):
14 html = urllib2.urlopen(home).read().decode('utf8')
15 soup = bs4.BeautifulSoup(html, 'html.parser')
16 pattern = 'http://\w+\.baijia\.baidu\.com/article/\w+'
17 links = soup.find_all('a', href=re.compile(pattern))
18 for link in links:
19 url_set.add(link['href'])
20
21 def GetNews(url):
22 global NewsCount,MaxNewsCount #全局记录新闻数量
23 while len(url_set) != 0:
24 try:
25 # 获取链接
26 url = url_set.pop()
27 url_old.add(url)
28
29 # 获取代码
30 html = urllib2.urlopen(url).read().decode('utf8')
31
32 # 解析
33 soup = bs4.BeautifulSoup(html, 'html.parser')
34 pattern = 'http://\w+\.baijia\.baidu\.com/article/\w+' # 链接匹配规则
35 links = soup.find_all('a', href=re.compile(pattern))
36
37 # 获取URL
38 for link in links:
39 if link['href'] not in url_old:
40 url_set.add(link['href'])
41
42 # 获取信息
43 article = News.News()
44 article.url = url # URL信息
45 page = soup.find('div', {'id': 'page'})
46 article.title = page.find('h1').get_text() # 标题信息
47 info = page.find('div', {'class': 'article-info'})
48 article.author = info.find('a', {'class': 'name'}).get_text() # 作者信息
49 article.date = info.find('span', {'class': 'time'}).get_text() # 日期信息
50 article.about = page.find('blockquote').get_text()
51 pnode = page.find('div', {'class': 'article-detail'}).find_all('p')
52 article.content = ''
53 for node in pnode: # 获取文章段落
54 article.content += node.get_text() + '\n' # 追加段落信息
55
56 SaveNews(article)
57
58 print NewsCount
59 break
60 except Exception as e:
61 print(e)
62 continue
63 else:
64 print(article.title)
65 NewsCount+=1
66 finally:
67 # 判断数据是否收集完成
68 if NewsCount == MaxNewsCount:
69 break
70
71 def SaveNews(Object):
72 file.write("【"+Object.title+"】"+"\t")
73 file.write(Object.author+"\t"+Object.date+"\n")
74 file.write(Object.content+"\n"+"\n")
75
76 url_set = set() # url集合
77 url_old = set() # 爬过的url集合
78
79 NewsCount = 0
80 MaxNewsCount=3
81
82 home = 'http://baijia.baidu.com/' # 起始位置
83
84 GetAllUrl(home)
85
86 file=codecs.open("D:\\test.txt","a+") #文件操作
87
88 for url in url_set:
89 GetNews(url)
90 # 判断数据是否收集完成
91 if NewsCount == MaxNewsCount:
92 break
93
94 file.close()
复制代码
新闻文章结构
复制代码
1 #coding: utf-8
2 # 文章类定义
3 class News(object):
4 def __init__(self):
5 self.url = None
6 self.title = None
7 self.author = None
8 self.date = None
9 self.about = None
10 self.content = None
可以使用python里面的一个爬虫库,beautifulsoup,这个库可以很方便的爬取数据。爬虫首先就得知道网页的链接,然后获取网页的源代码,通过正则表达式或者其他方法来获取所需要的内容,具体还是要对着网页源代码进行操作,查看需要哪些地方的数据,然后通过beautifulsoup来爬取特定html标签的内容。网上有很多相关的内容,可以看看。