Merge pull request #14 from cullenwatson/fix/scrapers/fetch-full-description

Fix full description
Cullen Watson 2023-08-26 07:20:15 -05:00 committed by GitHub
commit d67383f053
4 changed files with 123 additions and 24 deletions

View File

@@ -26,8 +26,9 @@ class ScraperInput(BaseModel):
 class Scraper:
-    def __init__(self, site: Site):
+    def __init__(self, site: Site, url: str):
         self.site = site
+        self.url = url

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         ...
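
The refactor above moves URL ownership into the base class: each scraper hands its site's base URL to `Scraper.__init__`, and request code appends endpoint paths to `self.url` instead of storing several full URLs. A minimal sketch of the new contract, assuming a stand-in for the package's `Site` enum and a hypothetical `ExampleScraper` subclass:

from enum import Enum


class Site(Enum):  # stand-in for the package's Site enum
    EXAMPLE = "example"


class Scraper:
    def __init__(self, site: Site, url: str):
        self.site = site
        self.url = url  # single base URL; endpoints are appended per request


class ExampleScraper(Scraper):  # hypothetical subclass
    def __init__(self):
        super().__init__(site=Site.EXAMPLE, url="https://www.example.com")

    def search_url(self) -> str:
        # mirrors e.g. session.get(self.url + "/jobs", ...) in IndeedScraper
        return f"{self.url}/jobs"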

View File

@@ -3,6 +3,7 @@ import json
 from typing import Optional, Tuple, List

 import tls_client
+import urllib.parse
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from fastapi import status
@@ -25,9 +26,8 @@ class IndeedScraper(Scraper):
         Initializes IndeedScraper with the Indeed job search url
         """
         site = Site(Site.INDEED)
-        super().__init__(site)
-        self.url = "https://www.indeed.com/jobs"
-        self.job_url = "https://www.indeed.com/viewjob?jk="
+        url = "https://www.indeed.com"
+        super().__init__(site, url)

         self.jobs_per_page = 15
         self.seen_urls = set()
@@ -60,7 +60,7 @@ class IndeedScraper(Scraper):
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
-        response = session.get(self.url, params=params)
+        response = session.get(self.url + "/jobs", params=params)

         if (
             response.status_code != status.HTTP_200_OK
@@ -82,10 +82,10 @@ class IndeedScraper(Scraper):
         ):
             raise Exception("No jobs found.")

-        for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-            job_url = f'{self.job_url}{job["jobkey"]}'
+        def process_job(job) -> Optional[JobPost]:
+            job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
             if job_url in self.seen_urls:
-                continue
+                return None

             snippet_html = BeautifulSoup(job["snippet"], "html.parser")
@@ -111,10 +111,15 @@ class IndeedScraper(Scraper):
                 timestamp_seconds = job["pubDate"] / 1000
                 date_posted = datetime.fromtimestamp(timestamp_seconds)

+            description = self.get_description(job_url, session)
+            li_elements = snippet_html.find_all("li")
+            if description is None and li_elements:
+                description = " ".join(li.text for li in li_elements)
+
             first_li = snippet_html.find("li")
             job_post = JobPost(
                 title=job["normTitle"],
-                description=first_li.text if first_li else None,
+                description=description,
                 company_name=job["company"],
                 location=Location(
                     city=job.get("jobLocationCity"),
@@ -127,6 +132,10 @@ class IndeedScraper(Scraper):
                 date_posted=date_posted,
                 job_url=job_url,
             )
+            return job_post
+
+        for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
+            job_post = process_job(job)
             job_list.append(job_post)

         return job_list, total_num_jobs
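
Note the shape of the refactor: the old loop body becomes a `process_job` function returning `Optional[JobPost]`, with `None` standing in for the old `continue` on duplicate URLs. A minimal sketch of that pattern in isolation, using a plain dict as a hypothetical stand-in for `JobPost`:

from typing import Optional

seen_urls: set = set()


def process_job(job: dict) -> Optional[dict]:  # dict stands in for JobPost
    job_url = f"https://www.indeed.com/jobs/viewjob?jk={job['jobkey']}"
    if job_url in seen_urls:
        return None  # duplicate; the old loop used `continue` here
    seen_urls.add(job_url)
    return {"job_url": job_url, "title": job.get("normTitle")}


jobs = [{"jobkey": "abc123", "normTitle": "Engineer"}, {"jobkey": "abc123"}]
job_list = [post for post in map(process_job, jobs) if post]  # drops the None
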
@@ -186,6 +195,30 @@ class IndeedScraper(Scraper):
             )

         return job_response

+    def get_description(
+        self, job_page_url: str, session: tls_client.Session
+    ) -> Optional[str]:
+        """
+        Retrieves job description by going to the job page url
+        :param job_page_url:
+        :param session:
+        :return: description or None
+        """
+        parsed_url = urllib.parse.urlparse(job_page_url)
+        params = urllib.parse.parse_qs(parsed_url.query)
+        jk_value = params.get("jk", [None])[0]
+        formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
+
+        response = session.get(formatted_url, allow_redirects=True)
+
+        if response.status_code not in range(200, 400):
+            return None
+
+        raw_description = response.json()["body"]["jobInfoWrapperModel"][
+            "jobInfoModel"
+        ]["sanitizedJobDescription"]
+        soup = BeautifulSoup(raw_description, "html.parser")
+        text_content = " ".join(soup.get_text().split()).strip()
+        return text_content
+
     @staticmethod
     def get_job_type(job: dict) -> Optional[JobType]:
         """

View File

@@ -15,10 +15,8 @@ class LinkedInScraper(Scraper):
         Initializes LinkedInScraper with the LinkedIn job search url
         """
         site = Site(Site.LINKEDIN)
-        super().__init__(site)
-
-        self.url = "https://www.linkedin.com/jobs/search/"
-        self.job_url = "https://www.linkedin.com/jobs/view/"
+        url = "https://www.linkedin.com"
+        super().__init__(site, url)

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
""" """
@@ -56,8 +54,9 @@ class LinkedInScraper(Scraper):
             }

             params = {k: v for k, v in params.items() if v is not None}
-            print(params)
-            response = session.get(self.url, params=params, allow_redirects=True)
+            response = session.get(
+                f"{self.url}/jobs/search", params=params, allow_redirects=True
+            )

             if response.status_code != status.HTTP_200_OK:
                 return JobResponse(
@@ -82,7 +81,7 @@
                 job_id = (
                     data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
                 )
-                job_url = f"{self.job_url}{job_id}"
+                job_url = f"{self.url}/jobs/view/{job_id}"
                 if job_url in seen_urls:
                     continue
                 seen_urls.add(job_url)
@@ -103,6 +102,7 @@
                 datetime_tag = metadata_card.find(
                     "time", class_="job-search-card__listdate"
                 )
+                description = LinkedInScraper.get_description(job_url)
                 if datetime_tag:
                     datetime_str = datetime_tag["datetime"]
                     date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
@@ -111,6 +111,7 @@
                 job_post = JobPost(
                     title=title,
+                    description=description,
                     company_name=company,
                     location=location,
                     date_posted=date_posted,
@@ -138,6 +139,27 @@
             )

         return job_response

+    @staticmethod
+    def get_description(job_page_url: str) -> Optional[str]:
+        """
+        Retrieves job description by going to the job page url
+        :param job_page_url:
+        :return: description or None
+        """
+        response = requests.get(job_page_url, allow_redirects=True)
+        if response.status_code not in range(200, 400):
+            return None
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        div_content = soup.find(
+            "div", class_=lambda x: x and "show-more-less-html__markup" in x
+        )
+
+        text_content = None
+        if div_content:
+            text_content = " ".join(div_content.get_text().split()).strip()
+        return text_content
+
     @staticmethod
     def get_location(metadata_card: Optional[Tag]) -> Location:
         """

View File

@@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs
 import tls_client
 from fastapi import status
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

 from api.core.jobs import JobPost
@@ -19,9 +20,9 @@ class ZipRecruiterScraper(Scraper):
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         site = Site(Site.ZIP_RECRUITER)
-        super().__init__(site)
+        url = "https://www.ziprecruiter.com"
+        super().__init__(site, url)

-        self.url = "https://www.ziprecruiter.com/jobs-search"
         self.jobs_per_page = 20
         self.seen_urls = set()
@@ -61,7 +62,9 @@
         }

         response = session.get(
-            self.url, headers=ZipRecruiterScraper.headers(), params=params
+            self.url + "/jobs-search",
+            headers=ZipRecruiterScraper.headers(),
+            params=params,
         )

         if response.status_code != status.HTTP_200_OK:
@@ -69,6 +72,7 @@
         html_string = response.content
         soup = BeautifulSoup(html_string, "html.parser")
+
         if page == 1:
             script_tag = soup.find("script", {"id": "js_variables"})
             data = json.loads(script_tag.string)
@@ -79,16 +83,24 @@
         job_posts = soup.find_all("div", {"class": "job_content"})

-        for job in job_posts:
+        def process_job(job: Tag) -> Optional[JobPost]:
+            """
+            Parses a job from the job content tag
+            :param job: BeautifulSoup Tag for one job post
+            :return: JobPost or None
+            """
             job_url = job.find("a", {"class": "job_link"})["href"]
             if job_url in self.seen_urls:
-                continue
+                return None

             title = job.find("h2", {"class": "title"}).text
             company = job.find("a", {"class": "company_name"}).text.strip()
-            description = job.find("p", {"class": "job_snippet"}).text.strip()
-            job_type_element = job.find("li", {"class": "perk_item perk_type"})
+
+            description, job_url = ZipRecruiterScraper.get_description(job_url, session)
+            if description is None:
+                description = job.find("p", {"class": "job_snippet"}).text.strip()
+
+            job_type_element = job.find("li", {"class": "perk_item perk_type"})

             if job_type_element:
                 job_type_text = (
                     job_type_element.text.strip()
@@ -114,7 +126,14 @@ class ZipRecruiterScraper(Scraper):
                 date_posted=date_posted,
                 job_url=job_url,
             )
-            job_list.append(job_post)
+            return job_post
+
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            job_results: list[Future] = [
+                executor.submit(process_job, job) for job in job_posts
+            ]
+
+        job_list = [result.result() for result in job_results if result.result()]

         return job_list, job_count
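
Since every job now costs an extra page fetch for its description, the loop is fanned out over a thread pool. Leaving the `with` block joins all workers, and falsy results (the `None` returns for duplicates) are filtered out afterwards. The pattern in isolation, with a hypothetical `fetch_one` standing in for `process_job`:

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Optional


def fetch_one(url: str) -> Optional[str]:  # stand-in for process_job
    return url.upper() if url else None  # None mimics a skipped duplicate


urls = ["https://example.com/a", "", "https://example.com/b"]
with ThreadPoolExecutor(max_workers=10) as executor:
    futures: list[Future] = [executor.submit(fetch_one, u) for u in urls]

# the with-block already waited for completion; .result() just unwraps
job_list = [f.result() for f in futures if f.result()]
print(job_list)  # ['HTTPS://EXAMPLE.COM/A', 'HTTPS://EXAMPLE.COM/B']

`Future.result()` is memoized once the future completes, so calling it twice in the comprehension is cheap, though binding it to a name first would be tidier.
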
@@ -163,6 +182,30 @@
             )

         return job_response

+    @staticmethod
+    def get_description(
+        job_page_url: str, session: tls_client.Session
+    ) -> Tuple[Optional[str], str]:
+        """
+        Retrieves job description by going to the job page url
+        :param job_page_url:
+        :param session:
+        :return: description or None, response url
+        """
+        response = session.get(
+            job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
+        )
+        if response.status_code not in range(200, 400):
+            return None, job_page_url
+        html_string = response.content
+        soup_job = BeautifulSoup(html_string, "html.parser")
+
+        job_description_div = soup_job.find("div", {"class": "job_description"})
+        if job_description_div:
+            return job_description_div.text.strip(), response.url
+        return None, response.url
+
     @staticmethod
     def get_interval(interval_str: str):
         """