fix(indeed): fetch full description

pull/14/head
Cullen Watson 2023-08-26 05:55:59 -05:00
parent eb728a572a
commit b4b836ff71
4 changed files with 48 additions and 25 deletions

View File

@ -26,8 +26,9 @@ class ScraperInput(BaseModel):
class Scraper:
def __init__(self, site: Site):
def __init__(self, site: Site, url: str):
self.site = site
self.url = url
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
...

View File

@ -3,6 +3,7 @@ import json
from typing import Optional, Tuple, List
import tls_client
import urllib.parse
from bs4 import BeautifulSoup
from bs4.element import Tag
from fastapi import status
@ -25,9 +26,8 @@ class IndeedScraper(Scraper):
Initializes IndeedScraper with the Indeed job search url
"""
site = Site(Site.INDEED)
super().__init__(site)
self.url = "https://www.indeed.com/jobs"
self.job_url = "https://www.indeed.com/viewjob?jk="
url = "https://www.indeed.com"
super().__init__(site, url)
self.jobs_per_page = 15
self.seen_urls = set()
@ -60,7 +60,7 @@ class IndeedScraper(Scraper):
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
response = session.get(self.url, params=params)
response = session.get(self.url + "/jobs", params=params)
if (
response.status_code != status.HTTP_200_OK
@ -82,10 +82,10 @@ class IndeedScraper(Scraper):
):
raise Exception("No jobs found.")
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
job_url = f'{self.job_url}{job["jobkey"]}'
def process_job(job) -> Optional[JobPost]:
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
continue
return None
snippet_html = BeautifulSoup(job["snippet"], "html.parser")
@ -110,11 +110,8 @@ class IndeedScraper(Scraper):
job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
li_elements = snippet_html.find_all("li")
if li_elements:
description = " ".join(li.text for li in li_elements)
else:
description = None
description = self.get_description(job_url, session)
first_li = snippet_html.find("li")
job_post = JobPost(
title=job["normTitle"],
@ -131,6 +128,10 @@ class IndeedScraper(Scraper):
date_posted=date_posted,
job_url=job_url,
)
return job_post
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
job_post = process_job(job)
job_list.append(job_post)
return job_list, total_num_jobs
@ -190,6 +191,27 @@ class IndeedScraper(Scraper):
)
return job_response
def get_description(self, job_page_url: str, session: tls_client.Session) -> str:
"""
Retrieves job description by going to the job page url
:param job_page_url:
:param session:
:return: description
"""
parsed_url = urllib.parse.urlparse(job_page_url)
params = urllib.parse.parse_qs(parsed_url.query)
jk_value = params.get("jk", [None])[0]
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
response = session.get(formatted_url, allow_redirects=True)
raw_description = response.json()["body"]["jobInfoWrapperModel"][
"jobInfoModel"
]["sanitizedJobDescription"]
soup = BeautifulSoup(raw_description, "html.parser")
text_content = " ".join(soup.get_text().split()).strip()
return text_content
@staticmethod
def get_job_type(job: dict) -> Optional[JobType]:
"""

View File

@ -15,10 +15,8 @@ class LinkedInScraper(Scraper):
Initializes LinkedInScraper with the LinkedIn job search url
"""
site = Site(Site.LINKEDIN)
super().__init__(site)
self.url = "https://www.linkedin.com/jobs/search/"
self.job_url = "https://www.linkedin.com/jobs/view/"
url = "https://www.linkedin.com"
super().__init__(site, url)
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@ -57,7 +55,9 @@ class LinkedInScraper(Scraper):
params = {k: v for k, v in params.items() if v is not None}
print(params)
response = session.get(self.url, params=params, allow_redirects=True)
response = session.get(
f"{self.url}/jobs/search", params=params, allow_redirects=True
)
if response.status_code != status.HTTP_200_OK:
return JobResponse(
@ -82,7 +82,7 @@ class LinkedInScraper(Scraper):
job_id = (
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
)
job_url = f"{self.job_url}{job_id}"
job_url = f"{self.url}/jobs/view/{job_id}"
if job_url in seen_urls:
continue
seen_urls.add(job_url)

View File

@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs
import tls_client
from fastapi import status
from bs4 import BeautifulSoup
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from api.core.jobs import JobPost
@ -14,14 +15,13 @@ import math
class ZipRecruiterScraper(Scraper):
url = "https://www.ziprecruiter.com"
def __init__(self):
"""
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
"""
site = Site(Site.ZIP_RECRUITER)
super().__init__(site)
url = "https://www.ziprecruiter.com"
super().__init__(site, url)
self.jobs_per_page = 20
self.seen_urls = set()
@ -84,11 +84,11 @@ class ZipRecruiterScraper(Scraper):
job_posts = soup.find_all("div", {"class": "job_content"})
def process_job(job: Tag) -> Optional[JobPost]:
'''
"""
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
'''
"""
job_url = job.find("a", {"class": "job_link"})["href"]
if job_url in self.seen_urls:
return None
@ -201,7 +201,7 @@ class ZipRecruiterScraper(Scraper):
job_description_div = soup_job.find("div", {"class": "job_description"})
if job_description_div:
return job_description_div.text.strip("\n"), response.url
return job_description_div.text.strip(), response.url
return None, response.url
@staticmethod