minor fix

Add company industry and job level to linkedin scraper (#166 )
enh: listing source (#168 )
2026-03-05 12:04:33 -08:00 · 2024-07-15 21:19:01 -05:00 · 2024-07-15 21:07:39 -05:00 · 2024-07-15 20:30:04 -05:00 · 2024-07-15 20:25:18 -05:00
8 changed files with 86 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ jobs = scrape_jobs(
    hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
    country_indeed='USA',  # only needed for indeed / glassdoor
-    # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
+    # linkedin_fetch_description=True # get full description, direct job url, company industry and job level (seniority level) for linkedin (slower)
    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
 )
@@ -150,10 +150,15 @@ JobPost
 ├── emails (str)
 └── is_remote (bool)
 Linkedin specific
 └── job_level (str)
 Linkedin & Indeed specific
 └── company_industry (str)
 Indeed specific
 ├── company_country (str)
 └── company_addresses (str)
 └── company_industry (str)
 └── company_employees_label (str)
 └── company_revenue_label (str)
 └── company_description (str)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.57"
+version = "1.1.58"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@@ -208,13 +208,15 @@ def scrape_jobs(
            "max_amount",
            "currency",
            "is_remote",
            "job_level",
            "job_function",
            "company_industry",
            "listing_type",
            "emails",
            "description",
            "company_url",
            "company_url_direct",
            "company_addresses",
            "company_industry",
            "company_num_employees",
            "company_revenue",
            "company_description",
--- a/src/jobspy/jobs/init.py
+++ b/src/jobspy/jobs/init.py
@@ -242,10 +242,16 @@ class JobPost(BaseModel):
    date_posted: date | None = None
    emails: list[str] | None = None
    is_remote: bool | None = None
    listing_type: str | None = None
    # linkedin specific
    job_level: str | None = None
    # linkedin and indeed specific
    company_industry: str | None = None
    # indeed specific
    company_addresses: str | None = None
    company_industry: str | None = None
    company_num_employees: str | None = None
    company_revenue: str | None = None
    company_description: str | None = None
--- a/src/jobspy/scrapers/glassdoor/init.py
+++ b/src/jobspy/scrapers/glassdoor/init.py
@@ -189,6 +189,15 @@ class GlassdoorScraper(Scraper):
        except:
            description = None
        company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
        company_logo = (
            job_data["jobview"].get("overview", {}).get("squareLogoUrl", None)
        )
        listing_type = (
            job_data["jobview"]
            .get("header", {})
            .get("adOrderSponsorshipLevel", "")
            .lower()
        )
        return JobPost(
            id=str(job_id),
            title=title,
@@ -201,6 +210,8 @@ class GlassdoorScraper(Scraper):
            is_remote=is_remote,
            description=description,
            emails=extract_emails_from_text(description) if description else None,
            logo_photo_url=company_logo,
            listing_type=listing_type,
        )
    def _fetch_job_description(self, job_id):
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@@ -176,7 +176,7 @@ class IndeedScraper(Scraper):
                keys.append("DSQF7")
            if keys:
-                keys_str = '", "'.join(keys)  # Prepare your keys string
+                keys_str = '", "'.join(keys)
                filters_str = f"""
                filters: {{
                  composite: {{
@@ -244,6 +244,7 @@ class IndeedScraper(Scraper):
                .replace("Iv1", "")
                .replace("_", " ")
                .title()
                .strip()
                if employer_details.get("industry")
                else None
            ),
@@ -353,7 +354,6 @@ class IndeedScraper(Scraper):
          jobSearch(
            {what}
            {location}
            includeSponsoredResults: NONE
            limit: 100
            sort: DATE
            {cursor}
@@ -365,6 +365,9 @@ class IndeedScraper(Scraper):
            results {{
              trackingKey
              job {{
                source {{
                  name
                }}
                key
                title
                datePublished
--- a/src/jobspy/scrapers/linkedin/init.py
+++ b/src/jobspy/scrapers/linkedin/init.py
@@ -219,6 +219,8 @@ class LinkedInScraper(Scraper):
            job_url=f"{self.base_url}/jobs/view/{job_id}",
            compensation=compensation,
            job_type=job_details.get("job_type"),
            job_level=job_details.get("job_level", "").lower(),
            company_industry=job_details.get("company_industry"),
            description=job_details.get("description"),
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
@@ -266,6 +268,8 @@ class LinkedInScraper(Scraper):
                job_function = job_function_span.text.strip()
        return {
            "description": description,
            "job_level": self._parse_job_level(soup),
            "company_industry": self._parse_company_industry(soup),
            "job_type": self._parse_job_type(soup),
            "job_url_direct": self._parse_job_url_direct(soup),
            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
@@ -325,6 +329,52 @@ class LinkedInScraper(Scraper):
        return [get_enum_from_job_type(employment_type)] if employment_type else []
    @staticmethod
    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
        """
        Gets the job level (seniority level) from job page
        :param soup_job_level: parsed job page to search
        :return: stripped text of the criteria span that follows the
            "Seniority level" subheader, or None when either the subheader
            or its sibling span is not found
        """
        h3_tag = soup_job_level.find(
            "h3",
            class_="description__job-criteria-subheader",
            # The callable is handed the tag's `.string`, which is None when
            # the <h3> holds more than a single text child; guard so such a
            # tag is simply not matched instead of raising TypeError.
            string=lambda text: bool(text) and "Seniority level" in text,
        )
        if not h3_tag:
            return None

        job_level_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if not job_level_span:
            return None
        return job_level_span.get_text(strip=True)
    @staticmethod
    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
        """
        Gets the company industry from job page
        :param soup_industry: parsed job page to search
        :return: stripped text of the criteria span that follows the
            "Industries" subheader, or None when either the subheader or its
            sibling span is not found
        """
        h3_tag = soup_industry.find(
            "h3",
            class_="description__job-criteria-subheader",
            # The callable is handed the tag's `.string`, which is None when
            # the <h3> holds more than a single text child; guard so such a
            # tag is simply not matched instead of raising TypeError.
            string=lambda text: bool(text) and "Industries" in text,
        )
        if not h3_tag:
            return None

        industry_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if not industry_span:
            return None
        return industry_span.get_text(strip=True)
    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
        """
        Gets the job url direct from job page
--- a/src/jobspy/scrapers/ziprecruiter/init.py
+++ b/src/jobspy/scrapers/ziprecruiter/init.py
@@ -135,6 +135,7 @@ class ZipRecruiterScraper(Scraper):
        self.seen_urls.add(job_url)
        description = job.get("job_description", "").strip()
        listing_type = job.get("buyer_type", "")
        description = (
            markdown_converter(description)
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
@@ -175,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
            description=description_full if description_full else description,
            emails=extract_emails_from_text(description) if description else None,
            job_url_direct=job_url_direct,
            listing_type=listing_type,
        )
    def _get_descr(self, job_url):
Author	SHA1	Message	Date
Cullen Watson	6330c14879	minor fix	2024-07-15 21:19:01 -05:00
Ali Bakhshi Ilani	48631ea271	Add company industry and job level to linkedin scraper (#166 )	2024-07-15 21:07:39 -05:00
Cullen Watson	edffe18e65	enh: listing source (#168 )	2024-07-15 20:30:04 -05:00
Lluís Salord Quetglas	0988230a24	FEAT: Add Glassdoor logo data if available (#167 )	2024-07-15 20:25:18 -05:00