fix(linkedin): fetch full description

2023-08-26 07:07:29 -05:00 · 2023-08-26 07:07:29 -05:00 · fe77c2a1f3
parent b4b836ff71
commit fe77c2a1f3
3 changed files with 34 additions and 3 deletions
--- a/api/core/scrapers/indeed/init.py
+++ b/api/core/scrapers/indeed/init.py
@ -110,7 +110,11 @@ class IndeedScraper(Scraper):
            job_type = IndeedScraper.get_job_type(job)
            timestamp_seconds = job["pubDate"] / 1000
            date_posted = datetime.fromtimestamp(timestamp_seconds)
            description = self.get_description(job_url, session)
            li_elements = snippet_html.find_all("li")
            if description is None and li_elements:
                description = " ".join(li.text for li in li_elements)
            first_li = snippet_html.find("li")
            job_post = JobPost(
@ -205,6 +209,9 @@ class IndeedScraper(Scraper):
        response = session.get(formatted_url, allow_redirects=True)
        if response.status_code not in range(200, 400):
            return None
        raw_description = response.json()["body"]["jobInfoWrapperModel"][
            "jobInfoModel"
        ]["sanitizedJobDescription"]
--- a/api/core/scrapers/linkedin/init.py
+++ b/api/core/scrapers/linkedin/init.py
@ -54,7 +54,6 @@ class LinkedInScraper(Scraper):
                }
                params = {k: v for k, v in params.items() if v is not None}
                print(params)
                response = session.get(
                    f"{self.url}/jobs/search", params=params, allow_redirects=True
                )
@ -103,6 +102,7 @@ class LinkedInScraper(Scraper):
                    datetime_tag = metadata_card.find(
                        "time", class_="job-search-card__listdate"
                    )
                    description = LinkedInScraper.get_description(job_url)
                    if datetime_tag:
                        datetime_str = datetime_tag["datetime"]
                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
@ -111,6 +111,7 @@ class LinkedInScraper(Scraper):
                    job_post = JobPost(
                        title=title,
                        description=description,
                        company_name=company,
                        location=location,
                        date_posted=date_posted,
@ -138,6 +139,27 @@ class LinkedInScraper(Scraper):
        )
        return job_response
    @staticmethod
    def get_description(job_page_url: str, timeout: float = 10.0) -> Optional[str]:
        """
        Retrieves job description by going to the job page url
        :param job_page_url: URL of the individual job posting page
        :param timeout: seconds to wait for the server before giving up
        :return: description with runs of whitespace collapsed to single
            spaces, or None when the page cannot be fetched, responds with a
            status outside 200-399, or has no description container
        """
        try:
            # requests has no default timeout; without one a stalled server
            # would block the caller indefinitely.
            response = requests.get(
                job_page_url, allow_redirects=True, timeout=timeout
            )
        except requests.RequestException:
            # Fetching the description is best-effort: a network failure is
            # reported the same way as a bad status code, via None.
            return None
        if response.status_code not in range(200, 400):
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        # Match any div whose class attribute contains the markup marker.
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )
        if div_content is None:
            return None
        # split() + join() collapses all whitespace and already trims the ends.
        return " ".join(div_content.get_text().split())
    @staticmethod
    def get_location(metadata_card: Optional[Tag]) -> Location:
        """
--- a/api/core/scrapers/ziprecruiter/init.py
+++ b/api/core/scrapers/ziprecruiter/init.py
@ -182,9 +182,9 @@ class ZipRecruiterScraper(Scraper):
        )
        return job_response
-    @classmethod
+    @staticmethod
    def get_description(
-        cls, job_page_url: str, session: tls_client.Session
+        job_page_url: str, session: tls_client.Session
    ) -> Tuple[Optional[str], str]:
        """
        Retrieves job description by going to the job page url
@ -195,6 +195,8 @@ class ZipRecruiterScraper(Scraper):
        response = session.get(
            job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
        )
        if response.status_code not in range(200, 400):
            return None
        html_string = response.content
        soup_job = BeautifulSoup(html_string, "html.parser")