From 08d63a87a25dab84ff2a50d51f3f6f7f31285fdf Mon Sep 17 00:00:00 2001
From: fasih hussain <58245239+fasihhussain00@users.noreply.github.com>
Date: Mon, 20 May 2024 21:45:52 +0500
Subject: [PATCH] chore: id added for JobPost schema (#152)

---
 src/jobspy/__init__.py                       |  1 +
 src/jobspy/jobs/__init__.py                  |  1 +
 src/jobspy/scrapers/glassdoor/__init__.py    |  1 +
 src/jobspy/scrapers/indeed/__init__.py       |  1 +
 src/jobspy/scrapers/linkedin/__init__.py     | 11 +++++++++++
 src/jobspy/scrapers/ziprecruiter/__init__.py |  1 +
 6 files changed, 16 insertions(+)

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index a87238e..a2656cb 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -168,6 +168,7 @@ def scrape_jobs(
 
         # Desired column order
         desired_order = [
+            "id",
             "site",
             "job_url_hyper" if hyperlinks else "job_url",
             "job_url_direct",
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 31cbce9..61816c5 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -226,6 +226,7 @@ class DescriptionFormat(Enum):
 
 
 class JobPost(BaseModel):
+    id: str | None = None
     title: str
     company_name: str | None
     job_url: str
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index 0d85aa6..89f5a95 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -190,6 +190,7 @@ class GlassdoorScraper(Scraper):
             description = None
         company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
         return JobPost(
+            id=str(job_id),
             title=title,
             company_url=company_url if company_id else None,
             company_name=company_name,
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index ff9985d..58303f5 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -213,6 +213,7 @@ class IndeedScraper(Scraper):
         employer_details = employer.get("employerDetails", {}) if employer else {}
         rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
         return JobPost(
+            id=str(job["key"]),
             title=job["title"],
             description=description,
             company_name=job["employer"].get("name") if job.get("employer") else None,
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 243faff..18fbb84 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -209,6 +209,7 @@ class LinkedInScraper(Scraper):
             job_details = self._get_job_details(job_url)
 
         return JobPost(
+            id=self._get_id(job_url),
             title=title,
             company_name=company,
             company_url=company_url,
@@ -223,6 +224,16 @@ class LinkedInScraper(Scraper):
             logo_photo_url=job_details.get("logo_photo_url"),
         )
 
+    def _get_id(self, url: str):
+        """
+        Extracts the job id from the job url
+        :param url:
+        :return: str
+        """
+        if not url:
+            return None
+        return url.split("/")[-1]
+
     def _get_job_details(self, job_page_url: str) -> dict:
         """
         Retrieves job description and other job details by going to the job page url
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index 329de38..fbe896f 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -151,6 +151,7 @@ class ZipRecruiterScraper(Scraper):
         comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
         comp_currency = job.get("compensation_currency")
         return JobPost(
+            id=str(job['listing_key']),
             title=title,
             company_name=company,
             location=location,