这个脚本是根据图书的isbn号来抓取对应的图书评分信息的。data.csv是一个包含图书的isbn的列表文件,每一行是一个图书的isbn号。 这个版本只使用了单线程抓取,并且只能从csv文件中读取数据。因为是朋友拜托抓取的,数据量不大,以后如果有需要再慢慢改进好了。 整个Python脚本很简单,主要使用了BeautifulSoup进行html的内容提取。
P.S.最近手头还有几个非常有趣的项目正在制作,希望我能够早日搞定它们 :)
import urllib,urllib2
import re
import BeautifulSoup
def isbn_2_score(isbn):
    """Look up a book on Douban by ISBN and return its rating.

    Performs two HTTP requests: a subject search for the ISBN, then a
    fetch of the first matching book's page, from which the rating is
    scraped out of the <strong class="ll rating_num"> element.

    Returns the rating as a float, or 0.0 when the search fails, no
    book matches, or no rating is present on the page.
    """
    url = 'http://www.douban.com/subject_search?search_text='
    try:
        response = urllib2.urlopen(url + isbn)
    except Exception:
        # Network/HTTP failure on the search request -> no score.
        return 0.0
    soup = BeautifulSoup.BeautifulSoup(response.read())
    # First search hit; soup.find returns None (not an exception) on miss.
    book_info = soup.find("a", {"class": "nbg"})
    if not isinstance(book_info, BeautifulSoup.Tag):
        return 0.0
    try:
        response = urllib2.urlopen(book_info['href'])
    except Exception:
        return 0.0
    soup = BeautifulSoup.BeautifulSoup(response.read())
    score_info = soup.find('strong', 'll rating_num')
    # BUG FIX: the original re-tested book_info here, so a page without
    # a rating element crashed on score_info.string below.
    if not isinstance(score_info, BeautifulSoup.Tag) or score_info.string is None:
        return 0.0
    try:
        # Normalize to float so callers always get a numeric score.
        return float(score_info.string.strip())
    except (TypeError, ValueError):
        return 0.0
def read_file(file_name):
    """Open *file_name* for reading and return the open file handle.

    The caller owns the handle and is responsible for closing it.
    """
    return open(file_name, 'r')
def return_isbn(file_handler):
    """Read and return the next ISBN line from *file_handler*.

    Returns the raw line (trailing newline included), or None once the
    end of the file is reached.  BUG FIX: file.readline() returns ''
    at EOF, never None, so the original made the caller's
    "while k is not None" loop spin forever; map '' to None here.
    """
    isbn = file_handler.readline()
    return isbn if isbn else None
if __name__ == '__main__':
    # Read one ISBN per line from data.csv, look up each score, and
    # write "isbn:score" lines to the file "dump".
    data = read_file('data.csv')
    f = open('dump', 'w')
    try:
        k = return_isbn(data)
        # BUG FIX: readline() yields '' (falsy) at EOF, never None, so
        # the original "while k is not None" looped forever.  Testing
        # truthiness terminates on both '' and None.
        while k:
            score = isbn_2_score(k)
            # rstrip('\n') instead of k[0:-1]: the slice chopped the
            # last digit when the final line had no trailing newline.
            result = k.rstrip('\n') + ":" + str(score) + "\n"
            print(result)
            f.write(result)
            k = return_isbn(data)
    finally:
        # Close both handles even if a lookup raises mid-run.
        f.close()
        data.close()
项目地址:https://github.com/quake0day/douban_crawler