Crawling Dynamic Web Pages

  • Install the Chrome browser
sudo apt-get update
sudo apt-get install -y google-chrome-stable
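Note that google-chrome-stable is served from Google's own repository, not the default Ubuntu ones; if the apt-get install above fails with a "package not found" error, a common alternative is to install Google's .deb package directly (which also registers the repository for later updates):

wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt-get install -y ./google-chrome-stable_current_amd64.deb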
  • Download a ChromeDriver that matches the installed Chrome browser version
pip install chromedriver-autoinstaller
  • Install Selenium
pip install selenium
  • Check with a Python script
import chromedriver_autoinstaller
from selenium import webdriver

# the ChromeDriver version must match the installed Chrome version
chrome_ver = chromedriver_autoinstaller.get_chrome_version()
print(chrome_ver)

# download a matching chromedriver into ./<major_version>/ and add it to PATH;
# install() returns the path of the downloaded driver binary
chromedriver_path = chromedriver_autoinstaller.install(True)
print(chromedriver_path)

url = 'https://google.com'
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(3)
driver.quit()
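If this is run on a server without a display (as the apt-get setup above suggests), webdriver.Chrome() will typically fail to start. A minimal sketch of the commonly used headless options (an assumption about the environment, not part of the original script):

options = webdriver.ChromeOptions()
options.add_argument('--headless')               # run Chrome without a visible window
options.add_argument('--no-sandbox')             # often needed when running as root or in containers
options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm size issues in containers
driver = webdriver.Chrome(options=options)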

[screenshot: dynamic web page]

  • Even when viewing the HTML source (Ctrl + U), the store list is not there.
  • Inspect the JavaScript function behind the 자세히 보기 (view details) link: javascript:storePop2('31')
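A quick way to confirm the page is dynamic is to fetch the raw HTML without a browser and look for the popup markup. A minimal sketch (the div.store_txt selector is the one used later in this post; the empty result is the expected behavior, not verified output):

import urllib.request
from bs4 import BeautifulSoup

html = urllib.request.urlopen('https://www.coffeebeankorea.com/store/store.asp').read()
soup = BeautifulSoup(html, 'html.parser')
print(soup.select('div.store_txt'))  # expected: [] -- the popup content is injected by JavaScript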

[screenshot: dynamic web page, store list]

[screenshot: dynamic web page, store detail popup]

  • Open the developer tools (F12) and click the element inspection mode

[screenshot: popup markup in developer tools]

  • Hands-on example
import os, time
from selenium import webdriver
import chromedriver_autoinstaller
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import datetime


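# Step-by-step exploratory version of [CODE 1] below: fetches a single store popup (storePop2('31')).
# Kept commented out; uncomment to test one store before running the full crawl.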
# chrome_ver = chromedriver_autoinstaller.get_chrome_version()
# chromedriver_autoinstaller.install(True)
# chromedriver_path = f'./{chrome_ver.split(".")[0]}/chromedriver.exe'
# url = 'https://www.coffeebeankorea.com/store/store.asp'
# wd = webdriver.Chrome()
# wd.get(url)
# wd.execute_script("storePop2('31');")
# time.sleep(3)
# html = wd.page_source
# soupCB = BeautifulSoup(html, 'html.parser')
# store_name_h2 = soupCB.select("div.store_txt>h2")[0].string
# store_info = soupCB.select("div.store_txt>table.store_table>tbody>tr>td")
# store_address_list = list(store_info[2])
# store_address = store_address_list[0]
# store_phone=store_info[3].string
# print(store_name_h2)
# print(store_address)
# print(store_phone)

# [CODE 1]
def CoffeeBean_store(result):
    CoffeeBean_URL = 'https://www.coffeebeankorea.com/store/store.asp'
    # the ChromeDriver version must match the installed Chrome version;
    # install() downloads a matching driver and adds it to PATH
    chromedriver_autoinstaller.install(True)
    wd = webdriver.Chrome()

    for i in range(1, 370):  # iterate over the store detail IDs
        wd.get(CoffeeBean_URL)
        time.sleep(3)  # wait for the page to load; see the explicit-wait sketch after this script
        try:
            wd.execute_script("storePop2(%s);" % str(i))  # JavaScript run when clicking "view details"
            time.sleep(3)
            html = wd.page_source
            soupCB = BeautifulSoup(html, 'html.parser')
            store_name_h2 = soupCB.select("div.store_txt > h2")  # check the popup markup with F12
            store_name = store_name_h2[0].string
            print(store_name)
            store_info = soupCB.select("div.store_txt > table.store_table > tbody > tr > td")
            store_address_list = list(store_info[2])
            store_address = store_address_list[0]
            store_phone = store_info[3].string
            result.append([store_name] + [store_address] + [store_phone])
        except Exception:
            continue  # skip store numbers that do not open a popup
    wd.quit()
    return

# [CODE 0]
def main():
    result = []
    print('CoffeeBean store crawling >>>>>>>>>>>')
    CoffeeBean_store(result) # [CODE 1]

    CB_tbl = pd.DataFrame(result, columns=('store', 'address', 'phone'))
    # cp949 is the Korean Windows encoding, so the CSV opens correctly in Korean Excel
    CB_tbl.to_csv('./CoffeeBean.csv', encoding='cp949', mode='w', index=True)

if __name__ == '__main__':
    main()
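The fixed time.sleep(3) calls above are fragile. A sketch of replacing them with Selenium's explicit waits (WebDriverWait with expected_conditions; wd is the driver from CoffeeBean_store, and the CSS selector is assumed from the popup markup used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the popup's store-name heading instead of sleeping a fixed 3 seconds
WebDriverWait(wd, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.store_txt > h2'))
)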
