fix(linkedin): fetch full description

2026-03-05 12:04:33 -08:00 · 2023-08-26 07:07:29 -05:00
parent b4b836ff71
commit fe77c2a1f3
3 changed files with 34 additions and 3 deletions
--- a/api/core/scrapers/linkedin/init.py
+++ b/api/core/scrapers/linkedin/init.py
@@ -54,7 +54,6 @@ class LinkedInScraper(Scraper):
                }

                params = {k: v for k, v in params.items() if v is not None}
-                print(params)
                response = session.get(
                    f"{self.url}/jobs/search", params=params, allow_redirects=True
                )
@@ -103,6 +102,7 @@ class LinkedInScraper(Scraper):
                    datetime_tag = metadata_card.find(
                        "time", class_="job-search-card__listdate"
                    )
+                    description = LinkedInScraper.get_description(job_url)
                    if datetime_tag:
                        datetime_str = datetime_tag["datetime"]
                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
@@ -111,6 +111,7 @@ class LinkedInScraper(Scraper):

                    job_post = JobPost(
                        title=title,
+                        description=description,
                        company_name=company,
                        location=location,
                        date_posted=date_posted,
@@ -138,6 +139,27 @@ class LinkedInScraper(Scraper):
        )
        return job_response

+    @staticmethod
+    def get_description(job_page_url: str) -> Optional[str]:
+        """
+        Retrieves job description by going to the job page url
+        :param job_page_url: URL of the job posting page to download
+        :return: whitespace-collapsed description, or None if unavailable
+        """
+        try:
+            # Called once per job card: bound the wait so one page can't hang the run.
+            response = requests.get(job_page_url, allow_redirects=True, timeout=10)
+        except requests.RequestException:
+            return None
+        if response.status_code not in range(200, 400):
+            return None
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        div_content = soup.find(
+            "div", class_=lambda x: x and "show-more-less-html__markup" in x
+        )
+        return " ".join(div_content.get_text().split()) if div_content else None
+
    @staticmethod
    def get_location(metadata_card: Optional[Tag]) -> Location:
        """