Saturday, November 19, 2016

[Crawler] web ptt

# Python 3.5
# OS: Windows 7
# IDE: PyCharm
# http://jialin128.pixnet.net/blog/post/118830572-%5Bpython%5D-ptt-crawler-in-python-%E4%BD%BF%E7%94%A8python%E7%88%AC%E6%89%B9%E8%B8%A2%E8%B8%A2%EF%BC%88
# http://stackoverflow.com/questions/31147660/importerror-no-module-named-selenium
from selenium import webdriver
from bs4 import BeautifulSoup
# http://jialin128.pixnet.net/blog/post/114056630-%5bpython%5d--%E4%BD%BF%E7%94%A8selenium%E5%9C%A8google-chrome%E7%80%8F%E8%A6%BD%E5%99%A8
# https://sites.google.com/a/chromium.org/chromedriver/downloads
chrome_path = r"C:\selenium_driver_chrome\chromedriver.exe"  # path to chromedriver.exe (raw string so the backslashes are not read as escapes)
web = webdriver.Chrome(chrome_path)
web.get('https://www.ptt.cc/bbs/movie/index.html')  # open the browser at the target page
num_page = int(input("How many pages do you want to crawl? "))
info = []
while num_page > 0:
    print("****************** new page ******************")
    c_url = web.current_url  # read the current URL from webdriver's current_url
    web.get(c_url)           # have the browser (re)load that URL
    html = web.page_source
    soup = BeautifulSoup(html, "html.parser")  # parse the page source
    container = soup.select('.r-ent')          # each .r-ent block is one post entry
    for each_item in container:
        print("Date: " + each_item.select('div.date')[0].text,
              "Author: " + each_item.select('div.author')[0].text)
        info.append("Date: " + each_item.select('div.date')[0].text +
                    "Author: " + each_item.select('div.author')[0].text +
                    each_item.select('div.title')[0].text)
        print(each_item.select('div.title')[0].text)
        print("---------------------------------")
    num_page = num_page - 1
    web.find_element_by_link_text("‹ 上頁").click()  # click the "‹ 上頁" (previous page) link
web.close()  # close the browser
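
The script above only prints the entries and accumulates them in info. As a minimal sketch (not part of the original post; the filename ptt_movie.txt is just an example), the collected list can be dumped to a UTF-8 text file right before web.close():

# Minimal sketch: write everything collected in info to a text file.
# "ptt_movie.txt" is only an example filename.
with open("ptt_movie.txt", "w", encoding="utf-8") as f:
    for line in info:
        f.write(line + "\n")

The second version below does the same crawl, but hard-codes the page count to 3 and only keeps posts whose title contains a keyword.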
# Python 3.5
# OS: Windows 7
# IDE: PyCharm
# http://jialin128.pixnet.net/blog/post/118830572-%5Bpython%5D-ptt-crawler-in-python-%E4%BD%BF%E7%94%A8python%E7%88%AC%E6%89%B9%E8%B8%A2%E8%B8%A2%EF%BC%88
# http://stackoverflow.com/questions/31147660/importerror-no-module-named-selenium
from selenium import webdriver
from bs4 import BeautifulSoup
# http://jialin128.pixnet.net/blog/post/114056630-%5bpython%5d--%E4%BD%BF%E7%94%A8selenium%E5%9C%A8google-chrome%E7%80%8F%E8%A6%BD%E5%99%A8
# https://sites.google.com/a/chromium.org/chromedriver/downloads
chrome_path = r"C:\selenium_driver_chrome\chromedriver.exe"  # path to chromedriver.exe (raw string so the backslashes are not read as escapes)
web = webdriver.Chrome(chrome_path)
web.get('https://www.ptt.cc/bbs/movie/index.html')  # open the browser at the target page
num_page = 3  # num_page = int(input("How many pages do you want to crawl? "))
info = []
while num_page > 0:
    print("****************** new page ******************")
    c_url = web.current_url  # read the current URL from webdriver's current_url
    web.get(c_url)           # have the browser (re)load that URL
    html = web.page_source
    soup = BeautifulSoup(html, "html.parser")  # parse the page source
    container = soup.select('.r-ent')          # each .r-ent block is one post entry
    for each_item in container:
        txt_key_word = each_item.select('div.title')[0].text
        idx_key_word = txt_key_word.find('獸與牠們的產地中')  # str.find already returns an int; -1 means not found
        if idx_key_word != -1:  # keep the entry only if the title contains the keyword
            print("Date: " + each_item.select('div.date')[0].text,
                  "Author: " + each_item.select('div.author')[0].text)
            info.append("Date: " + each_item.select('div.date')[0].text +
                        "Author: " + each_item.select('div.author')[0].text +
                        each_item.select('div.title')[0].text)
            print(each_item.select('div.title')[0].text)
            print("---------------------------------")
    num_page = num_page - 1
    web.find_element_by_link_text("‹ 上頁").click()  # click the "‹ 上頁" (previous page) link
web.stop_client()
web.close()  # close the browser window
web.quit()   # http://sqa.stackexchange.com/questions/1941/how-do-i-close-the-browser-window-at-the-end-of-a-selenium-test
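
Since the PTT index pages crawled above are plain static HTML, the same paging logic can also be done without launching a browser at all. The sketch below is not from the original post: it uses the requests package instead of Selenium, assumes PTT's markup where the second link inside div.btn-group-paging is the "‹ 上頁" button, and hard-codes 3 pages as an example.

import requests
from bs4 import BeautifulSoup

url = 'https://www.ptt.cc/bbs/movie/index.html'
for _ in range(3):  # 3 pages, just as an example
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    for each_item in soup.select('.r-ent'):  # same selectors as the Selenium version above
        print("Date: " + each_item.select('div.date')[0].text,
              "Author: " + each_item.select('div.author')[0].text)
        print(each_item.select('div.title')[0].text)
    # assumption: the second <a> inside div.btn-group-paging is the "‹ 上頁" (previous page) link
    prev_href = soup.select('div.btn-group-paging a')[1]['href']
    url = 'https://www.ptt.cc' + prev_href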

http://python.jobbole.com/87019/