chore: id added for JobPost schema (#152)

pull/154/head
fasih hussain 2024-05-20 21:45:52 +05:00 committed by GitHub
parent 1ffdb1756f
commit 08d63a87a2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 16 additions and 0 deletions

View File

@ -168,6 +168,7 @@ def scrape_jobs(
# Desired column order # Desired column order
desired_order = [ desired_order = [
"id",
"site", "site",
"job_url_hyper" if hyperlinks else "job_url", "job_url_hyper" if hyperlinks else "job_url",
"job_url_direct", "job_url_direct",

View File

@ -226,6 +226,7 @@ class DescriptionFormat(Enum):
class JobPost(BaseModel): class JobPost(BaseModel):
id: str | None = None
title: str title: str
company_name: str | None company_name: str | None
job_url: str job_url: str

View File

@ -190,6 +190,7 @@ class GlassdoorScraper(Scraper):
description = None description = None
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm" company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
return JobPost( return JobPost(
id=str(job_id),
title=title, title=title,
company_url=company_url if company_id else None, company_url=company_url if company_id else None,
company_name=company_name, company_name=company_name,

View File

@ -213,6 +213,7 @@ class IndeedScraper(Scraper):
employer_details = employer.get("employerDetails", {}) if employer else {} employer_details = employer.get("employerDetails", {}) if employer else {}
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
return JobPost( return JobPost(
id=str(job["key"]),
title=job["title"], title=job["title"],
description=description, description=description,
company_name=job["employer"].get("name") if job.get("employer") else None, company_name=job["employer"].get("name") if job.get("employer") else None,

View File

@ -209,6 +209,7 @@ class LinkedInScraper(Scraper):
job_details = self._get_job_details(job_url) job_details = self._get_job_details(job_url)
return JobPost( return JobPost(
id=self._get_id(job_url),
title=title, title=title,
company_name=company, company_name=company,
company_url=company_url, company_url=company_url,
@ -223,6 +224,16 @@ class LinkedInScraper(Scraper):
logo_photo_url=job_details.get("logo_photo_url"), logo_photo_url=job_details.get("logo_photo_url"),
) )
def _get_id(self, url: str):
"""
Extracts the job id from the job url
:param url:
:return: str
"""
if not url:
return None
return url.split("/")[-1]
def _get_job_details(self, job_page_url: str) -> dict: def _get_job_details(self, job_page_url: str) -> dict:
""" """
Retrieves job description and other job details by going to the job page url Retrieves job description and other job details by going to the job page url

View File

@ -151,6 +151,7 @@ class ZipRecruiterScraper(Scraper):
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
comp_currency = job.get("compensation_currency") comp_currency = job.get("compensation_currency")
return JobPost( return JobPost(
id=str(job['listing_key']),
title=title, title=title,
company_name=company, company_name=company,
location=location, location=location,