enh: listing source (#168)

pull/169/head
Cullen Watson 2024-07-15 20:30:04 -05:00 committed by GitHub
parent 0988230a24
commit edffe18e65
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 19 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.57"
+version = "1.1.58"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -209,6 +209,7 @@ def scrape_jobs(
 "currency",
 "is_remote",
 "job_function",
+"listing_type",
 "emails",
 "description",
 "company_url",

View File

@@ -242,6 +242,7 @@ class JobPost(BaseModel):
 date_posted: date | None = None
 emails: list[str] | None = None
 is_remote: bool | None = None
+listing_type: str | None = None
 # indeed specific
 company_addresses: str | None = None

View File

@@ -189,7 +189,15 @@ class GlassdoorScraper(Scraper):
 except:
 description = None
 company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
-company_logo = job_data["jobview"].get("overview", {}).get("squareLogoUrl", None)
+company_logo = (
+job_data["jobview"].get("overview", {}).get("squareLogoUrl", None)
+)
+listing_type = (
+job_data["jobview"]
+.get("header", {})
+.get("adOrderSponsorshipLevel", "")
+.lower()
+)
 return JobPost(
 id=str(job_id),
 title=title,
@@ -203,6 +211,7 @@ class GlassdoorScraper(Scraper):
 description=description,
 emails=extract_emails_from_text(description) if description else None,
 logo_photo_url=company_logo,
+listing_type=listing_type,
 )
 def _fetch_job_description(self, job_id):

View File

@@ -176,7 +176,7 @@ class IndeedScraper(Scraper):
 keys.append("DSQF7")
 if keys:
-keys_str = '", "'.join(keys) # Prepare your keys string
+keys_str = '", "'.join(keys)
 filters_str = f"""
 filters: {{
 composite: {{
@@ -353,7 +353,6 @@ class IndeedScraper(Scraper):
 jobSearch(
 {what}
 {location}
-includeSponsoredResults: NONE
 limit: 100
 sort: DATE
 {cursor}
@@ -365,6 +364,9 @@ class IndeedScraper(Scraper):
 results {{
 trackingKey
 job {{
+source {{
+name
+}}
 key
 title
 datePublished

View File

@@ -135,6 +135,7 @@ class ZipRecruiterScraper(Scraper):
 self.seen_urls.add(job_url)
 description = job.get("job_description", "").strip()
+listing_type = job.get("buyer_type", "")
 description = (
 markdown_converter(description)
 if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
@@ -175,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
 description=description_full if description_full else description,
 emails=extract_emails_from_text(description) if description else None,
 job_url_direct=job_url_direct,
+listing_type=listing_type,
 )
 def _get_descr(self, job_url):