fix(indeed): fetch full description

pull/14/head
Cullen Watson 2023-08-26 05:55:59 -05:00
parent eb728a572a
commit b4b836ff71
4 changed files with 48 additions and 25 deletions

View File

@@ -26,8 +26,9 @@ class ScraperInput(BaseModel):
class Scraper: class Scraper:
def __init__(self, site: Site): def __init__(self, site: Site, url: str):
self.site = site self.site = site
self.url = url
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
... ...

View File

@@ -3,6 +3,7 @@ import json
from typing import Optional, Tuple, List from typing import Optional, Tuple, List
import tls_client import tls_client
import urllib.parse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from fastapi import status from fastapi import status
@@ -25,9 +26,8 @@ class IndeedScraper(Scraper):
Initializes IndeedScraper with the Indeed job search url Initializes IndeedScraper with the Indeed job search url
""" """
site = Site(Site.INDEED) site = Site(Site.INDEED)
super().__init__(site) url = "https://www.indeed.com"
self.url = "https://www.indeed.com/jobs" super().__init__(site, url)
self.job_url = "https://www.indeed.com/viewjob?jk="
self.jobs_per_page = 15 self.jobs_per_page = 15
self.seen_urls = set() self.seen_urls = set()
@@ -60,7 +60,7 @@ class IndeedScraper(Scraper):
if sc_values: if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";" params["sc"] = "0kf:" + "".join(sc_values) + ";"
response = session.get(self.url, params=params) response = session.get(self.url + "/jobs", params=params)
if ( if (
response.status_code != status.HTTP_200_OK response.status_code != status.HTTP_200_OK
@@ -82,10 +82,10 @@ class IndeedScraper(Scraper):
): ):
raise Exception("No jobs found.") raise Exception("No jobs found.")
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: def process_job(job) -> Optional[JobPost]:
job_url = f'{self.job_url}{job["jobkey"]}' job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls: if job_url in self.seen_urls:
continue return None
snippet_html = BeautifulSoup(job["snippet"], "html.parser") snippet_html = BeautifulSoup(job["snippet"], "html.parser")
@@ -110,11 +110,8 @@ class IndeedScraper(Scraper):
job_type = IndeedScraper.get_job_type(job) job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000 timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = datetime.fromtimestamp(timestamp_seconds)
li_elements = snippet_html.find_all("li") description = self.get_description(job_url, session)
if li_elements:
description = " ".join(li.text for li in li_elements)
else:
description = None
first_li = snippet_html.find("li") first_li = snippet_html.find("li")
job_post = JobPost( job_post = JobPost(
title=job["normTitle"], title=job["normTitle"],
@@ -131,6 +128,10 @@ class IndeedScraper(Scraper):
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
) )
return job_post
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
job_post = process_job(job)
job_list.append(job_post) job_list.append(job_post)
return job_list, total_num_jobs return job_list, total_num_jobs
@@ -190,6 +191,27 @@ class IndeedScraper(Scraper):
) )
return job_response return job_response
def get_description(self, job_page_url: str, session: tls_client.Session) -> str:
"""
Retrieves job description by going to the job page url
:param job_page_url:
:param session:
:return: description
"""
parsed_url = urllib.parse.urlparse(job_page_url)
params = urllib.parse.parse_qs(parsed_url.query)
jk_value = params.get("jk", [None])[0]
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
response = session.get(formatted_url, allow_redirects=True)
raw_description = response.json()["body"]["jobInfoWrapperModel"][
"jobInfoModel"
]["sanitizedJobDescription"]
soup = BeautifulSoup(raw_description, "html.parser")
text_content = " ".join(soup.get_text().split()).strip()
return text_content
@staticmethod @staticmethod
def get_job_type(job: dict) -> Optional[JobType]: def get_job_type(job: dict) -> Optional[JobType]:
""" """

View File

@@ -15,10 +15,8 @@ class LinkedInScraper(Scraper):
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
site = Site(Site.LINKEDIN) site = Site(Site.LINKEDIN)
super().__init__(site) url = "https://www.linkedin.com"
super().__init__(site, url)
self.url = "https://www.linkedin.com/jobs/search/"
self.job_url = "https://www.linkedin.com/jobs/view/"
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
@@ -57,7 +55,9 @@ class LinkedInScraper(Scraper):
params = {k: v for k, v in params.items() if v is not None} params = {k: v for k, v in params.items() if v is not None}
print(params) print(params)
response = session.get(self.url, params=params, allow_redirects=True) response = session.get(
f"{self.url}/jobs/search", params=params, allow_redirects=True
)
if response.status_code != status.HTTP_200_OK: if response.status_code != status.HTTP_200_OK:
return JobResponse( return JobResponse(
@@ -82,7 +82,7 @@ class LinkedInScraper(Scraper):
job_id = ( job_id = (
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A" data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
) )
job_url = f"{self.job_url}{job_id}" job_url = f"{self.url}/jobs/view/{job_id}"
if job_url in seen_urls: if job_url in seen_urls:
continue continue
seen_urls.add(job_url) seen_urls.add(job_url)

View File

@@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs
import tls_client import tls_client
from fastapi import status from fastapi import status
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from api.core.jobs import JobPost from api.core.jobs import JobPost
@@ -14,14 +15,13 @@ import math
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
url = "https://www.ziprecruiter.com"
def __init__(self): def __init__(self):
""" """
Initializes LinkedInScraper with the ZipRecruiter job search url Initializes LinkedInScraper with the ZipRecruiter job search url
""" """
site = Site(Site.ZIP_RECRUITER) site = Site(Site.ZIP_RECRUITER)
super().__init__(site) url = "https://www.ziprecruiter.com"
super().__init__(site, url)
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
@@ -84,11 +84,11 @@ class ZipRecruiterScraper(Scraper):
job_posts = soup.find_all("div", {"class": "job_content"}) job_posts = soup.find_all("div", {"class": "job_content"})
def process_job(job: Tag) -> Optional[JobPost]: def process_job(job: Tag) -> Optional[JobPost]:
''' """
Parses a job from the job content tag Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post :param job: BeautifulSoup Tag for one job post
:return JobPost :return JobPost
''' """
job_url = job.find("a", {"class": "job_link"})["href"] job_url = job.find("a", {"class": "job_link"})["href"]
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
@@ -201,7 +201,7 @@ class ZipRecruiterScraper(Scraper):
job_description_div = soup_job.find("div", {"class": "job_description"}) job_description_div = soup_job.find("div", {"class": "job_description"})
if job_description_div: if job_description_div:
return job_description_div.text.strip("\n"), response.url return job_description_div.text.strip(), response.url
return None, response.url return None, response.url
@staticmethod @staticmethod