diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 4bfaa26..4826081 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -110,7 +110,11 @@ class IndeedScraper(Scraper):
             job_type = IndeedScraper.get_job_type(job)
             timestamp_seconds = job["pubDate"] / 1000
             date_posted = datetime.fromtimestamp(timestamp_seconds)
+            description = self.get_description(job_url, session)
+            li_elements = snippet_html.find_all("li")
+            if description is None and li_elements:
+                description = " ".join(li.text for li in li_elements)
             first_li = snippet_html.find("li")

             job_post = JobPost(

@@ -205,6 +209,9 @@ class IndeedScraper(Scraper):

         response = session.get(formatted_url, allow_redirects=True)

+        if response.status_code not in range(200, 400):
+            return None
+
         raw_description = response.json()["body"]["jobInfoWrapperModel"][
             "jobInfoModel"
         ]["sanitizedJobDescription"]
diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py
index 7e12143..c7019ad 100644
--- a/api/core/scrapers/linkedin/__init__.py
+++ b/api/core/scrapers/linkedin/__init__.py
@@ -54,7 +54,6 @@ class LinkedInScraper(Scraper):
         }

         params = {k: v for k, v in params.items() if v is not None}
-        print(params)
         response = session.get(
             f"{self.url}/jobs/search", params=params, allow_redirects=True
         )
@@ -103,6 +102,7 @@ class LinkedInScraper(Scraper):
                 datetime_tag = metadata_card.find(
                     "time", class_="job-search-card__listdate"
                 )
+                description = LinkedInScraper.get_description(job_url)
                 if datetime_tag:
                     datetime_str = datetime_tag["datetime"]
                     date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
@@ -111,6 +111,7 @@ class LinkedInScraper(Scraper):

                 job_post = JobPost(
                     title=title,
+                    description=description,
                     company_name=company,
                     location=location,
                     date_posted=date_posted,
@@ -138,6 +139,27 @@ class LinkedInScraper(Scraper):
         )
         return job_response

+    @staticmethod
+    def get_description(job_page_url: str) -> Optional[str]:
+        """
+        Retrieves job description by going to the job page url
+        :param job_page_url:
+        :return: description or None
+        """
+        response = requests.get(job_page_url, allow_redirects=True)
+        if response.status_code not in range(200, 400):
+            return None
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        div_content = soup.find(
+            "div", class_=lambda x: x and "show-more-less-html__markup" in x
+        )
+
+        text_content = None
+        if div_content:
+            text_content = " ".join(div_content.get_text().split()).strip()
+        return text_content
+
     @staticmethod
     def get_location(metadata_card: Optional[Tag]) -> Location:
         """
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 0bedc03..7180fb3 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -182,9 +182,9 @@ class ZipRecruiterScraper(Scraper):
         )
         return job_response

-    @classmethod
+    @staticmethod
     def get_description(
-        cls, job_page_url: str, session: tls_client.Session
+        job_page_url: str, session: tls_client.Session
     ) -> Tuple[Optional[str], str]:
         """
         Retrieves job description by going to the job page url
@@ -195,6 +195,8 @@ class ZipRecruiterScraper(Scraper):
         response = session.get(
             job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
         )
+        if response.status_code not in range(200, 400):
+            return None, job_page_url
         html_string = response.content
         soup_job = BeautifulSoup(html_string, "html.parser")