Add company industry and job level to linkedin scraper (#166)

2026-03-04 19:44:30 -08:00 · 2024-07-16 05:37:39 +03:30
parent edffe18e65
commit 48631ea271
4 changed files with 64 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ jobs = scrape_jobs(
    hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
    country_indeed='USA',  # only needed for indeed / glassdoor
    
-    # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
+    # linkedin_fetch_description=True # get full description, direct job url, company industry and job level (seniority level) for linkedin (slower)
    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
    
 )
@@ -150,10 +150,15 @@ JobPost
 ├── emails (str)
 └── is_remote (bool)

+Linkedin specific
+└── job_level (str)
+
+Linkedin & Indeed specific
+└── company_industry (str)
+
 Indeed specific
 ├── company_country (str)
 └── company_addresses (str)
-└── company_industry (str)
 └── company_employees_label (str)
 └── company_revenue_label (str)
 └── company_description (str)
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@@ -208,6 +208,7 @@ def scrape_jobs(
            "max_amount",
            "currency",
            "is_remote",
+            "job_level",
            "job_function",
            "listing_type",
            "emails",
--- a/src/jobspy/jobs/init.py
+++ b/src/jobspy/jobs/init.py
@@ -244,9 +244,14 @@ class JobPost(BaseModel):
    is_remote: bool | None = None
    listing_type: str | None = None

+    # linkedin specific
+    job_level: str | None = None
+
+    # linkedin and indeed specific
+    company_industry: str | None = None
+
    # indeed specific
    company_addresses: str | None = None
-    company_industry: str | None = None
    company_num_employees: str | None = None
    company_revenue: str | None = None
    company_description: str | None = None
--- a/src/jobspy/scrapers/linkedin/init.py
+++ b/src/jobspy/scrapers/linkedin/init.py
@@ -219,6 +219,8 @@ class LinkedInScraper(Scraper):
            job_url=f"{self.base_url}/jobs/view/{job_id}",
            compensation=compensation,
            job_type=job_details.get("job_type"),
+            job_level=job_details.get("job_level"),
+            company_industry=job_details.get("company_industry"),
            description=job_details.get("description"),
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
@@ -266,6 +268,8 @@ class LinkedInScraper(Scraper):
                job_function = job_function_span.text.strip()
        return {
            "description": description,
+            "job_level": self._parse_job_level(soup),
+            "company_industry": self._parse_company_industry(soup),
            "job_type": self._parse_job_type(soup),
            "job_url_direct": self._parse_job_url_direct(soup),
            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
@@ -325,6 +329,52 @@ class LinkedInScraper(Scraper):

        return [get_enum_from_job_type(employment_type)] if employment_type else []

+    @staticmethod
+    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
+        """
+        Gets the job level (the "Seniority level" criterion) from job page
+        :param soup_job_level: BeautifulSoup of the job page
+        :return: stripped criterion text, or None when it is not found
+        """
+        h3_tag = soup_job_level.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            # bs4 passes tag.string here, which is None for an <h3> with several children
+            string=lambda text: text is not None and "Seniority level" in text,
+        )
+        job_level = None
+        if h3_tag:
+            job_level_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if job_level_span:
+                job_level = job_level_span.get_text(strip=True)
+        return job_level
+    
+    @staticmethod
+    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
+        """
+        Gets the company industry (the "Industries" criterion) from job page
+        :param soup_industry: BeautifulSoup of the job page
+        :return: stripped criterion text, or None when it is not found
+        """
+        h3_tag = soup_industry.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            # bs4 passes tag.string here, which is None for an <h3> with several children
+            string=lambda text: text is not None and "Industries" in text,
+        )
+        industry = None
+        if h3_tag:
+            industry_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if industry_span:
+                industry = industry_span.get_text(strip=True)
+        return industry
+
    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
        """
        Gets the job url direct from job page