2016年11月19日 星期六

[Crawler] Get Title from web-ptt

[Crawler] Get Title from web-ptt
# Python3.5
# OS: Windows7
# IDE: PyCharm
# 擷取Ptt電影版文章之日期、作者、標題
# http://jialin128.pixnet.net/blog/post/106647560-%5Bpython%5D-ptt-crawler-in-python-%E4%BD%BF%E7%94%A8python%E7%88%AC%E6%89%B9%E8%B8%A2%E8%B8%A2
import urllib.request
# http://stackoverflow.com/questions/33331850/importerror-no-module-named-bs4-in-windows
from bs4 import BeautifulSoup
# http://stackoverflow.com/questions/16627227/http-error-403-in-python-3-web-scraping
from urllib.request import Request, urlopen
# Fetch the PTT movie board index page and print the date, author and title
# of every entry found on it.
#
# A browser-like User-Agent header is sent explicitly with the request
# (see the HTTP 403 reference noted next to the urllib imports).
req = Request('https://www.ptt.cc/bbs/movie/index.html', headers={'User-Agent': 'Mozilla/5.0'})

# Read the whole body inside a context manager so the HTTP connection is
# closed as soon as the bytes are in memory, even if reading raises.
with urlopen(req) as http_response:
    response = http_response.read()

soup = BeautifulSoup(response, "html.parser")

# Every element with class "r-ent" is treated as one article entry.
container = soup.select('.r-ent')

for each_item in container:
    # The loop body must be indented; at column 0 these statements made the
    # script fail to parse (IndentationError after the `for` header).
    print("日期:" + each_item.select('div.date')[0].text, "作者:" + each_item.select('div.author')[0].text)
    print(each_item.select('div.title')[0].text)
    print("---------------------------------")
Share:

Related Posts:

0 意見:

張貼留言