feat: add LinkedIn scraper

pull/12/head
Cullen Watson 2023-07-08 09:34:55 -05:00
parent 97bb82c089
commit fe1f8bc91e
7 changed files with 152 additions and 25 deletions

@@ -1,12 +1,13 @@
 import json
 import re
-import math
+from math import ceil
 import tls_client
 from bs4 import BeautifulSoup
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response


 class IndeedScraper(Scraper):
@@ -28,14 +29,15 @@ class IndeedScraper(Scraper):
         }

         response = session.get(self.url, params=params)
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result

         soup = BeautifulSoup(response.content, "html.parser")

         jobs = IndeedScraper.parse_jobs(soup)
         total_num_jobs = IndeedScraper.total_jobs(soup)
-        total_pages = math.ceil(total_num_jobs / 15)
+        total_pages = ceil(total_num_jobs / 15)

         job_list: list[JobPost] = []
         # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"]
@@ -75,7 +77,6 @@ class IndeedScraper(Scraper):
                     title=job["normTitle"],
                     description=first_li.text if first_li else None,
                     company_name=job["company"],
-                    industry=None,
                     location=Location(
                         city=job["jobLocationCity"],
                         state=job["jobLocationState"],

@@ -0,0 +1,95 @@
from datetime import datetime
from math import ceil

import requests
from bs4 import BeautifulSoup

from api.core.scrapers import Scraper, ScraperInput, Site
from api.core.jobs import *
from api.core.utils import handle_response


class LinkedInScraper(Scraper):
    def __init__(self):
        site = Site(Site.LINKEDIN)
        super().__init__(site)

        self.url = "https://www.linkedin.com/jobs"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location}

        self.url = f"{self.url}/{scraper_input.search_term}-jobs"
        response = requests.get(self.url, params=params)

        success, result = handle_response(response)
        if not success:
            return result

        soup = BeautifulSoup(response.text, "html.parser")

        job_list: list[JobPost] = []
        for job_card in soup.find_all(
            "div",
            class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
        ):
            job_url_tag = job_card.find("a", class_="base-card__full-link")
            job_url = job_url_tag["href"] if job_url_tag else "N/A"

            job_info = job_card.find("div", class_="base-search-card__info")
            if job_info is not None:
                title_tag = job_info.find("h3", class_="base-search-card__title")
                title = title_tag.text.strip() if title_tag else "N/A"

                company_tag = job_info.find("a", class_="hidden-nested-link")
                company = company_tag.text.strip() if company_tag else "N/A"

                metadata_card = job_info.find(
                    "div", class_="base-search-card__metadata"
                )
                location: Location = LinkedInScraper.get_location(metadata_card)

                # Not every card carries a posted date; default to None so
                # date_posted is never unbound, and guard the missing card.
                date_posted = None
                if metadata_card is not None:
                    datetime_tag = metadata_card.find(
                        "time", class_="job-search-card__listdate"
                    )
                    if datetime_tag:
                        datetime_str = datetime_tag["datetime"]
                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")

                job_post = JobPost(
                    title=title,
                    company_name=company,
                    location=location,
                    date_posted=date_posted,
                    delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
                )
                job_list.append(job_post)

        job_count_text = soup.find(
            "span", class_="results-context-header__job-count"
        ).text
        job_count = int("".join(filter(str.isdigit, job_count_text)))
        total_pages = ceil(job_count / 25)
        job_response = JobResponse(
            jobs=job_list,
            job_count=job_count,
            page=scraper_input.page,
            total_pages=total_pages,
        )
        return job_response

    @staticmethod
    def get_location(metadata_card):
        # Returns None when the card has no parsable "City, ST" string;
        # callers should treat the location as unknown in that case.
        location = None
        if metadata_card is not None:
            location_tag = metadata_card.find(
                "span", class_="job-search-card__location"
            )
            location_string = location_tag.text.strip() if location_tag else "N/A"
            parts = location_string.split(", ")
            if len(parts) == 2:
                city, state = parts
                location = Location(
                    country="US",
                    city=city,
                    state=state,
                )
        return location
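
A quick way to exercise the new scraper outside the HTTP layer is to drive it directly. A minimal sketch, assuming ScraperInput accepts the same search_term/location/page fields the route passes and that JobResponse exposes jobs and job_count as constructed above:

from api.core.scrapers import ScraperInput
from api.core.scrapers.linkedin import LinkedInScraper

# Hypothetical smoke test; the search values are placeholders.
scraper = LinkedInScraper()
scraper_input = ScraperInput(
    search_term="python developer", location="Austin, TX", page=1
)
job_response = scraper.scrape(scraper_input)
print(job_response.job_count, len(job_response.jobs))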

@@ -4,13 +4,14 @@ from urllib.parse import urlparse, parse_qs
 import tls_client
 from bs4 import BeautifulSoup
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response


 class ZipRecruiterScraper(Scraper):
     def __init__(self):
-        site = Site(Site.INDEED)
+        site = Site(Site.ZIP_RECRUITER)
         super().__init__(site)

         self.url = "https://www.ziprecruiter.com/jobs-search"
@@ -23,14 +24,15 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "page": min(scraper_input.page if scraper_input.page else 1, 10),
+            "page": min(scraper_input.page, 10),
         }

         response = session.get(
             self.url, headers=ZipRecruiterScraper.headers(), params=params
         )
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result

         html_string = response.content
         soup = BeautifulSoup(html_string, "html.parser")
@@ -57,7 +59,6 @@ class ZipRecruiterScraper(Scraper):
                 title=title,
                 description=description,
                 company_name=company,
-                industry=None,
                 location=ZipRecruiterScraper.get_location(job),
                 job_type=job_type,
                 compensation=ZipRecruiterScraper.get_compensation(job),

api/core/utils.py (new file)

@@ -0,0 +1,20 @@
def handle_response(response):
    """Normalize an HTTP response into a (success, payload) tuple."""
    if response.status_code == 200:
        try:
            return True, response.json()
        except ValueError:
            return True, response.text

    try:
        error_msg = response.json().get("message", "No detailed message provided.")
    except ValueError:
        error_msg = "No detailed message provided."

    error = {
        "message": "An error occurred during the request.",
        "status_code": response.status_code,
        "url": response.url,
        "details": error_msg,
    }
    return False, error
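
Each scraper unpacks the tuple and short-circuits on failure, as the hunks above show. A minimal sketch of the call-site pattern, assuming a requests-style response object (the URL is a placeholder):

import requests

response = requests.get("https://example.com/api/jobs")
success, result = handle_response(response)
if not success:
    # result is the structured error dict built by handle_response
    print(result["status_code"], result["details"])
else:
    print(result)  # parsed JSON when possible, raw text otherwise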

@@ -2,8 +2,8 @@ from fastapi import APIRouter
 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
+from api.core.scrapers.linkedin import LinkedInScraper
 from api.core.scrapers import ScraperInput
-from api.core.jobs import JobResponse

 router = APIRouter(prefix="/jobs")
@@ -13,17 +13,19 @@ async def scrape_jobs(
     site_type: str, search_term: str, location: str, page: int = None
 ):
     job_response = {"message": "site type not found"}
-    if site_type == "indeed":
-        indeed_scraper = IndeedScraper()
+    scraper_dict = {
+        "indeed": IndeedScraper,
+        "linkedin": LinkedInScraper,
+        "zip": ZipRecruiterScraper,
+    }
+
+    scraper_class = scraper_dict.get(site_type)
+    if scraper_class:
+        scraper = scraper_class()
         scraper_input = ScraperInput(
             search_term=search_term, location=location, page=page
         )
-        job_response = indeed_scraper.scrape(scraper_input)
-    elif site_type == "zip":
-        ziprecruiter_scraper = ZipRecruiterScraper()
-        scraper_input = ScraperInput(
-            search_term=search_term, location=location, page=page
-        )
-        job_response = ziprecruiter_scraper.scrape(scraper_input)
+        job_response = scraper.scrape(scraper_input)

     return job_response
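
With the dispatch table in place, every registered site goes through the same endpoint. A hypothetical request via FastAPI's test client, assuming the route is registered at the router root, the app module is named main, and the parameters arrive as plain query strings:

from fastapi.testclient import TestClient
from main import app  # module name assumed

client = TestClient(app)
resp = client.get(
    "/jobs/",
    params={
        "site_type": "linkedin",
        "search_term": "python developer",
        "location": "Austin, TX",
        "page": 1,
    },
)
print(resp.json())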

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+
 from api import router as api_router

 app = FastAPI()

@@ -7,4 +8,4 @@ app.include_router(api_router)

 @app.get("/")
 async def root():
-    return {"message": "Hello World"}
+    return {"message": "JobSpy Backend"}

requirements.txt (new file)

@@ -0,0 +1,7 @@
fastapi~=0.99.1
pydantic~=1.10.11
beautifulsoup4~=4.12.2
requests~=2.31.0
pip~=21.3.1
wheel~=0.37.1
setuptools~=60.2.0
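
The pins cover the runtime imports except tls_client and an ASGI server. A hypothetical local-run helper, assuming uvicorn is installed alongside the pinned requirements and the FastAPI() instance shown above lives in main.py:

# run_local.py (hypothetical helper)
import uvicorn

if __name__ == "__main__":
    # "main:app" assumes the FastAPI app module is named main.py
    uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)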