[Python] Web Crawling
열지희공
2022. 1. 14. 04:25
I built a web scraper that searches for "python" on Indeed and Stack Overflow, scrapes each job's title, company, location, and application link, and saves the results to a CSV file. The main Indeed site cannot be crawled, so I used the Korean Indeed site instead. I used the requests library to send the requests and the BeautifulSoup library to find specific tags.
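For reference, this is a minimal sketch of the fetch-and-parse pattern both scrapers below rely on: requests downloads the HTML, and BeautifulSoup turns it into a searchable tree (the URL and the printed value here are just examples, not part of the scraper).

import requests
from bs4 import BeautifulSoup

result = requests.get("https://kr.indeed.com/jobs?q=python")  # example request
soup = BeautifulSoup(result.text, "html.parser")              # parse the HTML into a tree
print(soup.title.string)                                      # print the page <title> just to confirm parsing worked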
<main.py>
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file
so_jobs = get_so_jobs()
indeed_jobs = get_indeed_jobs()
jobs = so_jobs + indeed_jobs
save_to_file(jobs)
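Each scraper returns a list of dicts with the same four keys, so main.py can simply concatenate the two lists before saving. As an illustration (the values below are made up, not real listings), one entry and its resulting CSV row look roughly like this:

# Example shape of one scraped job (illustrative values only)
job = {'title': 'Backend Developer (Python)', 'company': 'Example Corp', 'location': 'Seoul', 'link': 'https://kr.indeed.com/viewjob?jk=0123456789abcdef'}
# save_to_file writes it as one row of jobs.csv:
# Backend Developer (Python),Example Corp,Seoul,https://kr.indeed.com/viewjob?jk=0123456789abcdef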
<indeed.py>
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://kr.indeed.com/jobs?q=python&limit={LIMIT}"
def get_last_page():
    # Read the pagination bar on the first results page to find the number of the last page
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"}).find("ul")
    links = pagination.find_all("li")  # renamed from "list" to avoid shadowing the built-in
    pages = []
    for link in links[:-1]:
        pages.append(int(link.get_text()))
    max_page = pages[-1]
    return max_page


def extract_job(html):
    # Pull the title, company, location and job id out of a single job card
    title = html.find("td", {"class": "resultContent"}).find("span", class_=False).string
    company = html.find("td", {"class": "resultContent"}).find("div", {"class": "heading6 company_location tapItem-gutter"}).find("span").string
    location = html.find("td", {"class": "resultContent"}).find("div", {"class": "heading6 company_location tapItem-gutter"}).find("div").string
    job_id = html["data-jk"]
    return {'title': title, 'company': company, 'location': location, 'link': f"https://kr.indeed.com/viewjob?jk={job_id}"}


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping INDEED: Page {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find("div", {"id": "mosaic-provider-jobcards"}).find_all('a', recursive=False)
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
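The &start parameter is what moves through the Indeed result pages: each page shows LIMIT results, so page n starts at offset n * LIMIT. For example, if get_last_page() returned 3, extract_jobs would request these URLs (a worked example, assuming LIMIT = 50 as defined above):

for page in range(3):
    print(f"{URL}&start={page * LIMIT}")
# https://kr.indeed.com/jobs?q=python&limit=50&start=0
# https://kr.indeed.com/jobs?q=python&limit=50&start=50
# https://kr.indeed.com/jobs?q=python&limit=50&start=100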
<so.py>
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = "https://stackoverflow.com/jobs?q=python"
def get_last_page():
    # The second-to-last link in the pagination bar holds the last page number
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all('a')
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)


def extract_job(html):
    title = html.find("h2").find("a")["title"]
    # The two direct <span> children of <h3> are the company and the location
    company, location = html.find("h3").find_all("span", recursive=False)
    company = company.get_text(strip=True)
    location = location.get_text(strip=True)
    # The commented-out lines below are equivalent to the two lines above
    ##company_row = html.find("h3").find_all("span")
    ##company = company_row[0].get_text(strip=True)
    ##location = company_row[1].get_text(strip=True)
    # Another equivalent version
    ##company = html.find("h3").find("span", class_=False).string
    ##location = html.find("h3").find("span", class_=True).string
    job_id = html["data-jobid"]
    return {'title': title, 'company': company, 'location': location, 'link': f"https://stackoverflow.com/jobs/{job_id}"}


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        print(f"Scraping SO: Page {page}")
        result = requests.get(f"{URL}&pg={page+1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-job"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
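One caveat about the chained find calls in extract_job: if Stack Overflow changes its markup, find returns None and the next call raises AttributeError. The sketch below (my own defensive variant, not part of the original scraper) shows how a card that no longer matches the expected structure could be skipped instead:

def extract_job_safe(html):
    # Returns None instead of crashing when a card does not match the expected markup
    title_tag = html.find("h2")
    header = html.find("h3")
    if not title_tag or not title_tag.find("a") or not header:
        return None
    spans = header.find_all("span", recursive=False)
    if len(spans) < 2 or not html.has_attr("data-jobid"):
        return None
    return {'title': title_tag.find("a").get("title"),
            'company': spans[0].get_text(strip=True),
            'location': spans[1].get_text(strip=True),
            'link': f"https://stackoverflow.com/jobs/{html['data-jobid']}"}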
<save.py>
import csv
def save_to_file(jobs):
    # newline="" prevents blank rows on Windows; the with block closes the file automatically
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["title", "company", "location", "link"])
        for job in jobs:
            writer.writerow(list(job.values()))
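Because save_to_file writes list(job.values()), the column order depends on the dicts being built in the same order as the header row. A csv.DictWriter variant (an alternative sketch, not what this post uses) matches each cell by key instead, which is a bit more robust if the dict layout ever changes:

import csv

def save_to_file_dictwriter(jobs):
    # Same jobs.csv output, but columns are matched by field name rather than position
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["title", "company", "location", "link"])
        writer.writeheader()
        writer.writerows(jobs)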