fix(linkedin): fetch full description

pull/14/head
Cullen Watson 2023-08-26 07:07:29 -05:00
parent b4b836ff71
commit fe77c2a1f3
3 changed files with 34 additions and 3 deletions

@@ -110,7 +110,11 @@ class IndeedScraper(Scraper):
             job_type = IndeedScraper.get_job_type(job)
             timestamp_seconds = job["pubDate"] / 1000
             date_posted = datetime.fromtimestamp(timestamp_seconds)
             description = self.get_description(job_url, session)
+            li_elements = snippet_html.find_all("li")
+            if description is None and li_elements:
+                description = " ".join(li.text for li in li_elements)
+
             first_li = snippet_html.find("li")
             job_post = JobPost(
@@ -205,6 +209,9 @@ class IndeedScraper(Scraper):
         response = session.get(formatted_url, allow_redirects=True)
+        if response.status_code not in range(200, 400):
+            return None
+
         raw_description = response.json()["body"]["jobInfoWrapperModel"][
             "jobInfoModel"
         ]["sanitizedJobDescription"]

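The Indeed change adds a fallback: when the full-description fetch returns None, the bullet points from the search-result snippet are joined into a description. A minimal sketch of that fallback in isolation (the snippet markup and the None result are assumed for illustration):

from typing import Optional

from bs4 import BeautifulSoup

# Hypothetical snippet HTML of the kind Indeed returns with a search result.
snippet_html = BeautifulSoup(
    "<ul><li>401(k) match</li><li>Health insurance</li><li>Remote</li></ul>",
    "html.parser",
)
description: Optional[str] = None  # pretend get_description() failed

li_elements = snippet_html.find_all("li")
if description is None and li_elements:
    # Same fallback as the diff: join the snippet's bullet points.
    description = " ".join(li.text for li in li_elements)

print(description)  # 401(k) match Health insurance Remote
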
@@ -54,7 +54,6 @@ class LinkedInScraper(Scraper):
         }
         params = {k: v for k, v in params.items() if v is not None}
-        print(params)
         response = session.get(
             f"{self.url}/jobs/search", params=params, allow_redirects=True
         )
@@ -103,6 +102,7 @@ class LinkedInScraper(Scraper):
             datetime_tag = metadata_card.find(
                 "time", class_="job-search-card__listdate"
             )
+            description = LinkedInScraper.get_description(job_url)
             if datetime_tag:
                 datetime_str = datetime_tag["datetime"]
                 date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
@@ -111,6 +111,7 @@ class LinkedInScraper(Scraper):
             job_post = JobPost(
                 title=title,
+                description=description,
                 company_name=company,
                 location=location,
                 date_posted=date_posted,
@@ -138,6 +139,27 @@ class LinkedInScraper(Scraper):
         )
         return job_response

+    @staticmethod
+    def get_description(job_page_url: str) -> Optional[str]:
+        """
+        Retrieves job description by going to the job page url
+        :param job_page_url:
+        :return: description or None
+        """
+        response = requests.get(job_page_url, allow_redirects=True)
+        if response.status_code not in range(200, 400):
+            return None
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        div_content = soup.find(
+            "div", class_=lambda x: x and "show-more-less-html__markup" in x
+        )
+
+        text_content = None
+        if div_content:
+            text_content = " ".join(div_content.get_text().split()).strip()
+
+        return text_content
+
     @staticmethod
     def get_location(metadata_card: Optional[Tag]) -> Location:
         """

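Pulled out of the class, the new LinkedIn helper reduces to the sketch below (assuming the requests and beautifulsoup4 packages; the example URL shape is illustrative, not from this commit). LinkedIn serves the full job text inside a div whose class list contains show-more-less-html__markup, so the lookup matches on that substring:

from typing import Optional

import requests
from bs4 import BeautifulSoup


def get_description(job_page_url: str) -> Optional[str]:
    response = requests.get(job_page_url, allow_redirects=True)
    if response.status_code not in range(200, 400):
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    # Match on a substring because the div carries several other classes.
    div_content = soup.find(
        "div", class_=lambda x: x and "show-more-less-html__markup" in x
    )
    if div_content is None:
        return None
    # Collapse the whitespace runs left behind by the HTML markup.
    return " ".join(div_content.get_text().split()).strip()


# e.g. get_description("https://www.linkedin.com/jobs/view/1234567890")
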
@@ -182,9 +182,9 @@ class ZipRecruiterScraper(Scraper):
         )
         return job_response

-    @classmethod
+    @staticmethod
     def get_description(
-        cls, job_page_url: str, session: tls_client.Session
+        job_page_url: str, session: tls_client.Session
     ) -> Tuple[Optional[str], str]:
         """
         Retrieves job description by going to the job page url
@@ -195,6 +195,8 @@ class ZipRecruiterScraper(Scraper):
         response = session.get(
             job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
         )
+        if response.status_code not in range(200, 400):
+            return None
         html_string = response.content
         soup_job = BeautifulSoup(html_string, "html.parser")
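
All three scrapers now share the same guard: any status outside 2xx/3xx means the page could not be fetched and the description is treated as missing. A sketch of the idea as a standalone check (the helper name is hypothetical, not part of this commit):

import requests


def fetched_ok(response: requests.Response) -> bool:
    # 2xx and 3xx count as success; 4xx/5xx mean no description.
    return response.status_code in range(200, 400)


response = requests.get("https://example.com", allow_redirects=True)
description = response.text if fetched_ok(response) else None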