mirror of https://github.com/Bunsly/JobSpy
fix(linkedin): fetch full description
parent
b4b836ff71
commit
fe77c2a1f3
|
@ -110,7 +110,11 @@ class IndeedScraper(Scraper):
|
||||||
job_type = IndeedScraper.get_job_type(job)
|
job_type = IndeedScraper.get_job_type(job)
|
||||||
timestamp_seconds = job["pubDate"] / 1000
|
timestamp_seconds = job["pubDate"] / 1000
|
||||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||||
|
|
||||||
description = self.get_description(job_url, session)
|
description = self.get_description(job_url, session)
|
||||||
|
li_elements = snippet_html.find_all("li")
|
||||||
|
if description is None and li_elements:
|
||||||
|
description = " ".join(li.text for li in li_elements)
|
||||||
|
|
||||||
first_li = snippet_html.find("li")
|
first_li = snippet_html.find("li")
|
||||||
job_post = JobPost(
|
job_post = JobPost(
|
||||||
|
@ -205,6 +209,9 @@ class IndeedScraper(Scraper):
|
||||||
|
|
||||||
response = session.get(formatted_url, allow_redirects=True)
|
response = session.get(formatted_url, allow_redirects=True)
|
||||||
|
|
||||||
|
if response.status_code not in range(200, 400):
|
||||||
|
return None
|
||||||
|
|
||||||
raw_description = response.json()["body"]["jobInfoWrapperModel"][
|
raw_description = response.json()["body"]["jobInfoWrapperModel"][
|
||||||
"jobInfoModel"
|
"jobInfoModel"
|
||||||
]["sanitizedJobDescription"]
|
]["sanitizedJobDescription"]
|
||||||
|
|
|
@ -54,7 +54,6 @@ class LinkedInScraper(Scraper):
|
||||||
}
|
}
|
||||||
|
|
||||||
params = {k: v for k, v in params.items() if v is not None}
|
params = {k: v for k, v in params.items() if v is not None}
|
||||||
print(params)
|
|
||||||
response = session.get(
|
response = session.get(
|
||||||
f"{self.url}/jobs/search", params=params, allow_redirects=True
|
f"{self.url}/jobs/search", params=params, allow_redirects=True
|
||||||
)
|
)
|
||||||
|
@ -103,6 +102,7 @@ class LinkedInScraper(Scraper):
|
||||||
datetime_tag = metadata_card.find(
|
datetime_tag = metadata_card.find(
|
||||||
"time", class_="job-search-card__listdate"
|
"time", class_="job-search-card__listdate"
|
||||||
)
|
)
|
||||||
|
description = LinkedInScraper.get_description(job_url)
|
||||||
if datetime_tag:
|
if datetime_tag:
|
||||||
datetime_str = datetime_tag["datetime"]
|
datetime_str = datetime_tag["datetime"]
|
||||||
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
|
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
|
||||||
|
@ -111,6 +111,7 @@ class LinkedInScraper(Scraper):
|
||||||
|
|
||||||
job_post = JobPost(
|
job_post = JobPost(
|
||||||
title=title,
|
title=title,
|
||||||
|
description=description,
|
||||||
company_name=company,
|
company_name=company,
|
||||||
location=location,
|
location=location,
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
|
@ -138,6 +139,27 @@ class LinkedInScraper(Scraper):
|
||||||
)
|
)
|
||||||
return job_response
|
return job_response
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_description(job_page_url: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Retrieves job description by going to the job page url
|
||||||
|
:param job_page_url:
|
||||||
|
:return: description or None
|
||||||
|
"""
|
||||||
|
response = requests.get(job_page_url, allow_redirects=True)
|
||||||
|
if response.status_code not in range(200, 400):
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
div_content = soup.find(
|
||||||
|
"div", class_=lambda x: x and "show-more-less-html__markup" in x
|
||||||
|
)
|
||||||
|
|
||||||
|
text_content = None
|
||||||
|
if div_content:
|
||||||
|
text_content = " ".join(div_content.get_text().split()).strip()
|
||||||
|
return text_content
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_location(metadata_card: Optional[Tag]) -> Location:
|
def get_location(metadata_card: Optional[Tag]) -> Location:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -182,9 +182,9 @@ class ZipRecruiterScraper(Scraper):
|
||||||
)
|
)
|
||||||
return job_response
|
return job_response
|
||||||
|
|
||||||
@classmethod
|
@staticmethod
|
||||||
def get_description(
|
def get_description(
|
||||||
cls, job_page_url: str, session: tls_client.Session
|
job_page_url: str, session: tls_client.Session
|
||||||
) -> Tuple[Optional[str], str]:
|
) -> Tuple[Optional[str], str]:
|
||||||
"""
|
"""
|
||||||
Retrieves job description by going to the job page url
|
Retrieves job description by going to the job page url
|
||||||
|
@ -195,6 +195,8 @@ class ZipRecruiterScraper(Scraper):
|
||||||
response = session.get(
|
response = session.get(
|
||||||
job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
|
job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
|
||||||
)
|
)
|
||||||
|
if response.status_code not in range(200, 400):
|
||||||
|
return None
|
||||||
|
|
||||||
html_string = response.content
|
html_string = response.content
|
||||||
soup_job = BeautifulSoup(html_string, "html.parser")
|
soup_job = BeautifulSoup(html_string, "html.parser")
|
||||||
|
|
Loading…
Reference in New Issue