mirror of https://github.com/Bunsly/JobSpy
feat(jobs): filter by is_remote
parent
2b63d4c84f
commit
d69c41392d
|
@ -37,25 +37,16 @@ class Compensation(BaseModel):
|
|||
currency: str = "US"
|
||||
|
||||
|
||||
class DeliveryEnum(Enum):
|
||||
EMAIL = "email"
|
||||
URL = "url"
|
||||
|
||||
|
||||
class Delivery(BaseModel):
|
||||
method: DeliveryEnum
|
||||
value: str
|
||||
|
||||
|
||||
class JobPost(BaseModel):
|
||||
title: str
|
||||
company_name: str
|
||||
job_url: str
|
||||
location: Location
|
||||
|
||||
description: str = None
|
||||
job_type: JobType = None
|
||||
compensation: Compensation = None
|
||||
date_posted: datetime = None
|
||||
delivery: Delivery = None
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
|
|
|
@ -11,10 +11,11 @@ class Site(Enum):
|
|||
|
||||
class ScraperInput(BaseModel):
|
||||
site_type: Site
|
||||
|
||||
search_term: str
|
||||
location: str
|
||||
distance: int = 25
|
||||
|
||||
location: str = None
|
||||
distance: int = None
|
||||
is_remote: bool = False
|
||||
|
||||
results_wanted: int = 15
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ class IndeedScraper(Scraper):
|
|||
site = Site(Site.INDEED)
|
||||
super().__init__(site)
|
||||
self.url = "https://www.indeed.com/jobs"
|
||||
self.job_url = "https://www.indeed.com/viewjob?jk="
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
|
@ -41,14 +42,18 @@ class IndeedScraper(Scraper):
|
|||
while len(job_list) < scraper_input.results_wanted:
|
||||
params = {
|
||||
"q": scraper_input.search_term,
|
||||
"l": scraper_input.location,
|
||||
"location": scraper_input.location,
|
||||
"radius": scraper_input.distance,
|
||||
"sc": "0kf:attr(DSQF7);" if scraper_input.is_remote else None,
|
||||
"filter": 0,
|
||||
"start": 0 + page * 10,
|
||||
"radius": scraper_input.distance,
|
||||
}
|
||||
|
||||
response = session.get(self.url, params=params)
|
||||
|
||||
if response.status_code == 307:
|
||||
new_url = response.headers["Location"]
|
||||
response = session.get(new_url)
|
||||
if response.status_code != status.HTTP_200_OK:
|
||||
return JobResponse(
|
||||
success=False,
|
||||
|
@ -78,7 +83,7 @@ class IndeedScraper(Scraper):
|
|||
)
|
||||
|
||||
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
|
||||
job_url = job["thirdPartyApplyUrl"]
|
||||
job_url = f'{self.job_url}{job["jobkey"]}'
|
||||
if job_url in seen_urls:
|
||||
continue
|
||||
snippet_html = BeautifulSoup(job["snippet"], "html.parser")
|
||||
|
@ -104,10 +109,6 @@ class IndeedScraper(Scraper):
|
|||
)
|
||||
|
||||
job_type = IndeedScraper.get_job_type(job)
|
||||
if job.get("thirdPartyApplyUrl"):
|
||||
delivery = Delivery(method=DeliveryEnum.URL, value=job_url)
|
||||
else:
|
||||
delivery = None
|
||||
timestamp_seconds = job["pubDate"] / 1000
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||
|
||||
|
@ -117,15 +118,15 @@ class IndeedScraper(Scraper):
|
|||
description=first_li.text if first_li else None,
|
||||
company_name=job["company"],
|
||||
location=Location(
|
||||
city=job["jobLocationCity"],
|
||||
state=job["jobLocationState"],
|
||||
city=job.get("jobLocationCity"),
|
||||
state=job.get("jobLocationState"),
|
||||
postal_code=job.get("jobLocationPostal"),
|
||||
country="US",
|
||||
),
|
||||
job_type=job_type,
|
||||
compensation=compensation,
|
||||
date_posted=date_posted,
|
||||
delivery=delivery,
|
||||
job_url=job_url,
|
||||
)
|
||||
job_list.append(job_post)
|
||||
if len(job_list) >= scraper_input.results_wanted:
|
||||
|
|
|
@ -17,7 +17,8 @@ class LinkedInScraper(Scraper):
|
|||
site = Site(Site.LINKEDIN)
|
||||
super().__init__(site)
|
||||
|
||||
self.url = "https://www.linkedin.com/jobs"
|
||||
self.url = "https://www.linkedin.com/jobs/search/"
|
||||
self.job_url = "https://www.linkedin.com/jobs/view/"
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
|
@ -32,12 +33,13 @@ class LinkedInScraper(Scraper):
|
|||
with requests.Session() as session:
|
||||
while len(job_list) < scraper_input.results_wanted:
|
||||
params = {
|
||||
"pageNum": page,
|
||||
"keywords": scraper_input.search_term,
|
||||
"location": scraper_input.location,
|
||||
"distance": scraper_input.distance,
|
||||
"f_WT": 2 if scraper_input.is_remote else None,
|
||||
"pageNum": page,
|
||||
}
|
||||
|
||||
self.url = f"{self.url}/{scraper_input.search_term}-jobs"
|
||||
response = session.get(self.url, params=params, allow_redirects=True)
|
||||
|
||||
if response.status_code != status.HTTP_200_OK:
|
||||
|
@ -58,8 +60,11 @@ class LinkedInScraper(Scraper):
|
|||
"div",
|
||||
class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
|
||||
):
|
||||
job_url_tag = job_card.find("a", class_="base-card__full-link")
|
||||
job_url = job_url_tag["href"] if job_url_tag else "N/A"
|
||||
data_entity_urn = job_card.get("data-entity-urn", "")
|
||||
job_id = (
|
||||
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
|
||||
)
|
||||
job_url = f"{self.job_url}{job_id}"
|
||||
if job_url in seen_urls:
|
||||
continue
|
||||
seen_urls.add(job_url)
|
||||
|
@ -91,7 +96,7 @@ class LinkedInScraper(Scraper):
|
|||
company_name=company,
|
||||
location=location,
|
||||
date_posted=date_posted,
|
||||
delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
|
||||
job_url=job_url,
|
||||
)
|
||||
job_list.append(job_post)
|
||||
if len(job_list) >= scraper_input.results_wanted:
|
||||
|
|
|
@ -38,8 +38,11 @@ class ZipRecruiterScraper(Scraper):
|
|||
params = {
|
||||
"search": scraper_input.search_term,
|
||||
"location": scraper_input.location,
|
||||
"page": page,
|
||||
"radius": scraper_input.distance,
|
||||
"refine_by_location_type": "only_remote"
|
||||
if scraper_input.is_remote
|
||||
else None,
|
||||
"page": page,
|
||||
}
|
||||
|
||||
response = session.get(
|
||||
|
@ -88,7 +91,7 @@ class ZipRecruiterScraper(Scraper):
|
|||
job_type=job_type,
|
||||
compensation=ZipRecruiterScraper.get_compensation(job),
|
||||
date_posted=date_posted,
|
||||
delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
|
||||
job_url=job_url,
|
||||
)
|
||||
job_list.append(job_post)
|
||||
if len(job_list) >= scraper_input.results_wanted:
|
||||
|
|
Loading…
Reference in New Issue