
[Python] Web Crawling

열지희공 2022. 1. 14. 04:25

I built a web scraper that searches Indeed and Stack Overflow for python, scrapes each posting's job title, company, location, and application link, and saves the results to a CSV file. The main (US) Indeed page cannot be crawled, so I used the Korean Indeed site instead. I used the requests library to send the requests and the BeautifulSoup library to find specific tags.
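
Both scrapers below follow the same basic pattern: fetch a page with requests, parse the HTML with BeautifulSoup, and pick out tags by name and class. A minimal sketch of that pattern (the URL and the "job-card" class name here are made-up placeholders, not the real selectors used below):

import requests
from bs4 import BeautifulSoup

# Placeholder URL and class name, only to show the request -> parse -> find flow.
result = requests.get("https://example.com/jobs?q=python")
soup = BeautifulSoup(result.text, "html.parser")
for card in soup.find_all("div", {"class": "job-card"}):
  print(card.get_text(strip=True))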

<main.py>

from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file

# Scrape both sites, merge the results, and write them to jobs.csv.
so_jobs = get_so_jobs()
indeed_jobs = get_indeed_jobs()
jobs = so_jobs + indeed_jobs
save_to_file(jobs)

<indeed.py>

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://kr.indeed.com/jobs?q=python&limit={LIMIT}"

def get_last_page():
  # Read the pagination bar on the first results page and return the highest page number.
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pagination = soup.find("div", {"class": "pagination"}).find("ul")
  links = pagination.find_all("li")
  pages = []
  for link in links[:-1]:  # drop the last item, which is not a page number
    pages.append(int(link.get_text()))
  max_page = pages[-1]
  return max_page

def extract_job(html):
  # Each job card has a <td class="resultContent"> holding the title, company and location.
  content = html.find("td", {"class": "resultContent"})
  title = content.find("span", class_=False).string
  company_location = content.find("div", {"class": "heading6 company_location tapItem-gutter"})
  company = company_location.find("span").string
  location = company_location.find("div").string
  job_id = html["data-jk"]
  return {'title': title, 'company': company, 'location': location, 'link': f"https://kr.indeed.com/viewjob?jk={job_id}"}

def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    print(f"Scraping INDEED: Page {page}")
    # Indeed paginates with a "start" offset, LIMIT results per page.
    result = requests.get(f"{URL}&start={page*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    # Each job card is a direct <a> child of the job-cards container.
    results = soup.find("div", {"id": "mosaic-provider-jobcards"}).find_all('a', recursive=False)
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return jobs
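
Before wiring everything together in main.py, indeed.py can be checked on its own. A quick sanity-check sketch (assuming it is run from the same folder as indeed.py):

from indeed import get_jobs

# Scrape Indeed only and inspect the first result.
jobs = get_jobs()
print(len(jobs), "jobs scraped from Indeed")
if jobs:
  print(jobs[0])  # one dict with 'title', 'company', 'location', 'link' keys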

<so.py>

import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python"


def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class": "s-pagination"}).find_all('a')
  # The last link is the "next" arrow, so the second-to-last holds the last page number.
  last_page = pages[-2].get_text(strip=True)
  return int(last_page)

def extract_job(html):
  title = html.find("h2").find("a")["title"]
  # The <h3> has two direct <span> children: the first is the company, the second the location.
  company, location = html.find("h3").find_all("span", recursive=False)
  company = company.get_text(strip=True)
  location = location.get_text(strip=True)

  # The code below is equivalent to the two lines above:
  ## company_row = html.find("h3").find_all("span")
  ## company = company_row[0].get_text(strip=True)
  ## location = company_row[1].get_text(strip=True)

  # Another equivalent way:
  ## company = html.find("h3").find("span", class_=False).string
  ## location = html.find("h3").find("span", class_=True).string

  job_id = html["data-jobid"]
  return {'title': title, 'company': company, 'location': location, 'link': f"https://stackoverflow.com/jobs/{job_id}"}
  

def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    print(f"Scraping SO: Page {page}")
    # Stack Overflow paginates with a "pg" query parameter starting at 1.
    result = requests.get(f"{URL}&pg={page+1}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "-job"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs



def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return jobs

 

<save.py>

import csv

def save_to_file(jobs):
  # newline="" lets the csv module manage line endings; the with-block closes the file automatically.
  with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["title", "company", "location", "link"])
    for job in jobs:
      writer.writerow(list(job.values()))
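
To check the result, jobs.csv can be read back with the same csv module. A minimal sketch, assuming the file was just written by save_to_file above:

import csv

with open("jobs.csv", newline="", encoding="utf-8") as file:
  reader = csv.DictReader(file)  # maps each row to the header written above
  for row in reader:
    print(row["title"], "-", row["link"])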