# NOTE: This file contains bidirectional Unicode text that may be interpreted
# or compiled differently than what appears below. To review, open the file in
# an editor that reveals hidden Unicode characters.
# Python3.5
# OS: Windows7
# IDE: PyCharm
# PTT movie-board crawler: steps backwards through the board index pages and
# collects date / author / title for every post on each page.
# http://jialin128.pixnet.net/blog/post/118830572-%5Bpython%5D-ptt-crawler-in-python-%E4%BD%BF%E7%94%A8python%E7%88%AC%E6%89%B9%E8%B8%A2%E8%B8%A2%EF%BC%88
# http://stackoverflow.com/questions/31147660/importerror-no-module-named-selenium
from selenium import webdriver
from bs4 import BeautifulSoup
# http://jialin128.pixnet.net/blog/post/114056630-%5bpython%5d--%E4%BD%BF%E7%94%A8selenium%E5%9C%A8google-chrome%E7%80%8F%E8%A6%BD%E5%99%A8
# https://sites.google.com/a/chromium.org/chromedriver/downloads
# Path of the chromedriver executable.  A raw string is required: in a normal
# literal "\s" and "\c" are invalid escape sequences (SyntaxWarning on modern
# Python, and silently fragile).
chrome_path = r"C:\selenium_driver_chrome\chromedriver.exe"
web = webdriver.Chrome(chrome_path)
web.get('https://www.ptt.cc/bbs/movie/index.html')  # open the board index page
num_page = int(input("請問您想擷取幾頁?"))  # how many index pages to scrape
info = []  # accumulated "date + author + title" strings
while num_page > 0:
    print("******************新頁面******************")
    # Parse the page the browser is already showing.  (The original re-fetched
    # web.current_url here, which just reloaded the same page a second time.)
    soup = BeautifulSoup(web.page_source, "html.parser")
    for each_item in soup.select('.r-ent'):
        dates = each_item.select('div.date')
        authors = each_item.select('div.author')
        titles = each_item.select('div.title')
        if not (dates and authors and titles):
            continue  # deleted posts may lack one of these divs; skip them
        print("日期:" + dates[0].text, "作者:" + authors[0].text)
        info.append("日期:" + dates[0].text + "作者:" + authors[0].text + titles[0].text)
        print(titles[0].text)
        print("---------------------------------")
    num_page = num_page - 1
    web.find_element_by_link_text("‹ 上頁").click()  # click the "‹ 上頁" (previous page) link
web.close()  # close the browser window
# NOTE: This file contains bidirectional Unicode text that may be interpreted
# or compiled differently than what appears below. To review, open the file in
# an editor that reveals hidden Unicode characters.
# Python3.5
# OS: Windows7
# IDE: PyCharm
# PTT movie-board crawler with a keyword filter: steps backwards through the
# board index pages and keeps only posts whose title contains the keyword.
# http://jialin128.pixnet.net/blog/post/118830572-%5Bpython%5D-ptt-crawler-in-python-%E4%BD%BF%E7%94%A8python%E7%88%AC%E6%89%B9%E8%B8%A2%E8%B8%A2%EF%BC%88
# http://stackoverflow.com/questions/31147660/importerror-no-module-named-selenium
from selenium import webdriver
from bs4 import BeautifulSoup
# http://jialin128.pixnet.net/blog/post/114056630-%5bpython%5d--%E4%BD%BF%E7%94%A8selenium%E5%9C%A8google-chrome%E7%80%8F%E8%A6%BD%E5%99%A8
# https://sites.google.com/a/chromium.org/chromedriver/downloads
# Path of the chromedriver executable.  A raw string is required: in a normal
# literal "\s" and "\c" are invalid escape sequences.
chrome_path = r"C:\selenium_driver_chrome\chromedriver.exe"
web = webdriver.Chrome(chrome_path)
web.get('https://www.ptt.cc/bbs/movie/index.html')  # open the board index page
num_page = 3  # num_page = int(input("請問您想擷取幾頁?"))
info = []  # accumulated "date + author + title" strings for matching posts
while num_page > 0:
    print("******************新頁面******************")
    # Parse the page the browser is already showing.  (The original re-fetched
    # web.current_url here, which just reloaded the same page a second time.)
    soup = BeautifulSoup(web.page_source, "html.parser")
    for each_item in soup.select('.r-ent'):
        titles = each_item.select('div.title')
        dates = each_item.select('div.date')
        authors = each_item.select('div.author')
        if not (titles and dates and authors):
            continue  # deleted posts may lack one of these divs; skip them
        txt_key_word = titles[0].text
        # BUG FIX: str.find() returns 0 when the keyword is at the very start
        # of the title and -1 when absent, so the original "idx_key_word > 0"
        # test silently dropped titles that BEGIN with the keyword.  Use the
        # "in" operator, which is both correct and idiomatic.
        if '獸與牠們的產地中' in txt_key_word:
            print("日期:" + dates[0].text, "作者:" + authors[0].text)
            info.append("日期:" + dates[0].text + "作者:" + authors[0].text + txt_key_word)
            print(txt_key_word)
            print("---------------------------------")
    num_page = num_page - 1
    web.find_element_by_link_text("‹ 上頁").click()  # click the "‹ 上頁" (previous page) link
# quit() ends the chromedriver session and closes every window.  The original
# also called stop_client() (internal API) and close() first; both are
# redundant once quit() is called.
# http://sqa.stackexchange.com/questions/1941/how-do-i-close-the-browser-window-at-the-end-of-a-selenium-test
web.quit()
# Reference: http://python.jobbole.com/87019/
# (blog footer) 0 意見: 張貼留言