fix(linkedin): fetch full description

pull/14/head
Cullen Watson 2023-08-26 07:07:29 -05:00
parent b4b836ff71
commit fe77c2a1f3
3 changed files with 34 additions and 3 deletions

View File

@ -110,7 +110,11 @@ class IndeedScraper(Scraper):
job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
description = self.get_description(job_url, session)
li_elements = snippet_html.find_all("li")
if description is None and li_elements:
description = " ".join(li.text for li in li_elements)
first_li = snippet_html.find("li")
job_post = JobPost(
@ -205,6 +209,9 @@ class IndeedScraper(Scraper):
response = session.get(formatted_url, allow_redirects=True)
if response.status_code not in range(200, 400):
return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][
"jobInfoModel"
]["sanitizedJobDescription"]

View File

@ -54,7 +54,6 @@ class LinkedInScraper(Scraper):
}
params = {k: v for k, v in params.items() if v is not None}
print(params)
response = session.get(
f"{self.url}/jobs/search", params=params, allow_redirects=True
)
@ -103,6 +102,7 @@ class LinkedInScraper(Scraper):
datetime_tag = metadata_card.find(
"time", class_="job-search-card__listdate"
)
description = LinkedInScraper.get_description(job_url)
if datetime_tag:
datetime_str = datetime_tag["datetime"]
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
@ -111,6 +111,7 @@ class LinkedInScraper(Scraper):
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=location,
date_posted=date_posted,
@ -138,6 +139,27 @@ class LinkedInScraper(Scraper):
)
return job_response
@staticmethod
def get_description(job_page_url: str, timeout: float = 10.0) -> Optional[str]:
    """
    Retrieves job description by going to the job page url
    :param job_page_url: URL of the job posting page to fetch
    :param timeout: seconds to wait on the server before giving up
    :return: description text with whitespace runs collapsed to single
        spaces, or None if the page could not be fetched, returned a
        status outside 200-399, or has no description markup
    """
    try:
        # requests has no default timeout; without one a stalled server
        # would block this call (and the scrape) indefinitely.
        response = requests.get(
            job_page_url, allow_redirects=True, timeout=timeout
        )
    except requests.RequestException:
        # The description is optional, so a network failure on one job
        # page is reported as "no description" instead of raising.
        return None

    if response.status_code not in range(200, 400):
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    # class_ receives None for tags without a class attribute, hence the
    # `x and ...` guard before the substring test.
    div_content = soup.find(
        "div", class_=lambda x: x and "show-more-less-html__markup" in x
    )
    if not div_content:
        return None

    # split() with no arguments drops leading/trailing whitespace and
    # splits on any whitespace run, so no extra strip() is needed.
    return " ".join(div_content.get_text().split())
@staticmethod
def get_location(metadata_card: Optional[Tag]) -> Location:
"""

View File

@ -182,9 +182,9 @@ class ZipRecruiterScraper(Scraper):
)
return job_response
@classmethod
@staticmethod
def get_description(
cls, job_page_url: str, session: tls_client.Session
job_page_url: str, session: tls_client.Session
) -> Tuple[Optional[str], str]:
"""
Retrieves job description by going to the job page url
@ -195,6 +195,8 @@ class ZipRecruiterScraper(Scraper):
response = session.get(
job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
)
if response.status_code not in range(200, 400):
return None
html_string = response.content
soup_job = BeautifulSoup(html_string, "html.parser")