# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Python 3.5
# OS: Windows 7
# IDE: PyCharm
# Fetch the date, author, and title of each post listed on the PTT movie board.
# Reference tutorial:
# http://jialin128.pixnet.net/blog/post/106647560-%5Bpython%5D-ptt-crawler-in-python-%E4%BD%BF%E7%94%A8python%E7%88%AC%E6%89%B9%E8%B8%A2%E8%B8%A2
import urllib.request
# A bare urlopen() without a browser-like User-Agent gets HTTP 403; see
# http://stackoverflow.com/questions/16627227/http-error-403-in-python-3-web-scraping
from urllib.request import Request, urlopen

# If "No module named bs4" is raised, install beautifulsoup4; see
# http://stackoverflow.com/questions/33331850/importerror-no-module-named-bs4-in-windows
from bs4 import BeautifulSoup

# Build the request for the board's index page with a User-Agent header set.
req = Request('https://www.ptt.cc/bbs/movie/index.html', headers={'User-Agent': 'Mozilla/5.0'})

# Read the whole page body; the `with` block closes the HTTP response even if
# reading fails, instead of leaving the connection open.
with urlopen(req) as page:
    response = page.read()

soup = BeautifulSoup(response, "html.parser")

# Select every element with class "r-ent"; each one is treated as one post row.
container = soup.select('.r-ent')

for each_item in container:
    # [0] takes the first matching <div> inside the row; it raises IndexError
    # if the row has no such element.
    print("日期:" + each_item.select('div.date')[0].text, "作者:" + each_item.select('div.author')[0].text)
    print(each_item.select('div.title')[0].text)
    # Visual separator between consecutive posts.
    print("---------------------------------")
# 0 意見:
# 張貼留言