mirror of https://github.com/Bunsly/JobSpy

feat(jobs): filter by is_remote
parent  2b63d4c84f
commit  d69c41392d
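This commit threads a new is_remote flag from ScraperInput through the Indeed, LinkedIn, and ZipRecruiter scrapers, each via that site's own query parameter. Along the way it drops the Delivery/DeliveryEnum models in favor of a required job_url field on JobPost, makes location and distance optional, and builds canonical job URLs from each site's job id instead of third-party apply links.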
@@ -37,25 +37,16 @@ class Compensation(BaseModel):
     currency: str = "US"


-class DeliveryEnum(Enum):
-    EMAIL = "email"
-    URL = "url"
-
-
-class Delivery(BaseModel):
-    method: DeliveryEnum
-    value: str
-
-
 class JobPost(BaseModel):
     title: str
     company_name: str
+    job_url: str
     location: Location

     description: str = None
     job_type: JobType = None
     compensation: Compensation = None
     date_posted: datetime = None
-    delivery: Delivery = None


 class JobResponse(BaseModel):
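With Delivery gone, callers read the link straight off JobPost.job_url. A hypothetical instantiation under the new schema (values are illustrative; Location's keyword fields are assumed from the hunk above):

    post = JobPost(
        title="Software Engineer",        # illustrative values only
        company_name="Acme Corp",
        job_url="https://www.indeed.com/viewjob?jk=1a2b3c4d5e6f",
        location=Location(city="Austin", state="TX", country="US"),
    )
    print(post.job_url)  # plain URL, no Delivery(method=..., value=...) wrapper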
@@ -11,10 +11,11 @@ class Site(Enum):

 class ScraperInput(BaseModel):
     site_type: Site

     search_term: str
-    location: str
-    distance: int = 25
+    location: str = None
+    distance: int = None
+    is_remote: bool = False

     results_wanted: int = 15

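Since location and distance now default to None, a remote-only search can omit geography entirely. A minimal sketch, assuming Site.INDEED from the enum above:

    scraper_input = ScraperInput(
        site_type=Site.INDEED,
        search_term="python developer",  # illustrative term
        is_remote=True,                  # no location or distance required
    )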
@@ -23,6 +23,7 @@ class IndeedScraper(Scraper):
         site = Site(Site.INDEED)
         super().__init__(site)
         self.url = "https://www.indeed.com/jobs"
+        self.job_url = "https://www.indeed.com/viewjob?jk="

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -41,14 +42,18 @@ class IndeedScraper(Scraper):
         while len(job_list) < scraper_input.results_wanted:
             params = {
                 "q": scraper_input.search_term,
-                "l": scraper_input.location,
+                "location": scraper_input.location,
+                "radius": scraper_input.distance,
+                "sc": "0kf:attr(DSQF7);" if scraper_input.is_remote else None,
                 "filter": 0,
                 "start": 0 + page * 10,
-                "radius": scraper_input.distance,
             }

             response = session.get(self.url, params=params)

+            if response.status_code == 307:
+                new_url = response.headers["Location"]
+                response = session.get(new_url)
             if response.status_code != status.HTTP_200_OK:
                 return JobResponse(
                     success=False,
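Two things are going on here. The "sc" value carries Indeed's serialized filter expression, where attr(DSQF7) appears to be the token for the remote attribute; when is_remote is False the value is None, and requests silently drops None-valued entries when building the query string. The new 307 branch re-issues the request against the Location header, without the original query string. A standalone check of the None-dropping behavior (not JobSpy code):

    import requests

    prepared = requests.Request(
        "GET",
        "https://www.indeed.com/jobs",
        params={"q": "python", "sc": None},
    ).prepare()
    print(prepared.url)  # https://www.indeed.com/jobs?q=python  ("sc" dropped)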
@@ -78,7 +83,7 @@ class IndeedScraper(Scraper):
             )

             for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-                job_url = job["thirdPartyApplyUrl"]
+                job_url = f'{self.job_url}{job["jobkey"]}'
                 if job_url in seen_urls:
                     continue
                 snippet_html = BeautifulSoup(job["snippet"], "html.parser")
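Deduplication now keys on Indeed's canonical view URL rather than the third-party apply link, which a result may lack. The URL is just the new base plus the result's jobkey (the key shown is made up):

    job = {"jobkey": "1a2b3c4d5e6f"}  # hypothetical jobkey
    job_url = f'https://www.indeed.com/viewjob?jk={job["jobkey"]}'
    # -> https://www.indeed.com/viewjob?jk=1a2b3c4d5e6f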
@@ -104,10 +109,6 @@ class IndeedScraper(Scraper):
                 )

                 job_type = IndeedScraper.get_job_type(job)
-                if job.get("thirdPartyApplyUrl"):
-                    delivery = Delivery(method=DeliveryEnum.URL, value=job_url)
-                else:
-                    delivery = None
                 timestamp_seconds = job["pubDate"] / 1000
                 date_posted = datetime.fromtimestamp(timestamp_seconds)

@@ -117,15 +118,15 @@ class IndeedScraper(Scraper):
                     description=first_li.text if first_li else None,
                     company_name=job["company"],
                     location=Location(
-                        city=job["jobLocationCity"],
-                        state=job["jobLocationState"],
+                        city=job.get("jobLocationCity"),
+                        state=job.get("jobLocationState"),
                         postal_code=job.get("jobLocationPostal"),
                         country="US",
                     ),
                     job_type=job_type,
                     compensation=compensation,
                     date_posted=date_posted,
-                    delivery=delivery,
+                    job_url=job_url,
                 )
                 job_list.append(job_post)
                 if len(job_list) >= scraper_input.results_wanted:
@@ -17,7 +17,8 @@ class LinkedInScraper(Scraper):
         site = Site(Site.LINKEDIN)
         super().__init__(site)

-        self.url = "https://www.linkedin.com/jobs"
+        self.url = "https://www.linkedin.com/jobs/search/"
+        self.job_url = "https://www.linkedin.com/jobs/view/"

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -32,12 +33,13 @@ class LinkedInScraper(Scraper):
         with requests.Session() as session:
             while len(job_list) < scraper_input.results_wanted:
                 params = {
-                    "pageNum": page,
+                    "keywords": scraper_input.search_term,
                     "location": scraper_input.location,
                     "distance": scraper_input.distance,
+                    "f_WT": 2 if scraper_input.is_remote else None,
+                    "pageNum": page,
                 }

-                self.url = f"{self.url}/{scraper_input.search_term}-jobs"
                 response = session.get(self.url, params=params, allow_redirects=True)

                 if response.status_code != status.HTTP_200_OK:
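f_WT is LinkedIn's workplace-type facet; 2 is commonly reported as the remote value, and the None sentinel again keeps the key out of non-remote queries. Moving the search term into a keywords parameter also fixes a latent bug: the deleted line mutated self.url inside the pagination loop, appending "<term>-jobs" again on every page. A sketch of how the old pattern compounds (illustrative only):

    url = "https://www.linkedin.com/jobs"
    for page in range(2):
        url = f"{url}/python-jobs"  # old pattern: grows each iteration
    print(url)  # https://www.linkedin.com/jobs/python-jobs/python-jobs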
@@ -58,8 +60,11 @@ class LinkedInScraper(Scraper):
                     "div",
                     class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
                 ):
-                    job_url_tag = job_card.find("a", class_="base-card__full-link")
-                    job_url = job_url_tag["href"] if job_url_tag else "N/A"
+                    data_entity_urn = job_card.get("data-entity-urn", "")
+                    job_id = (
+                        data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
+                    )
+                    job_url = f"{self.job_url}{job_id}"
                     if job_url in seen_urls:
                         continue
                     seen_urls.add(job_url)
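The job id now comes from the card's data-entity-urn attribute, which typically has the form urn:li:jobPosting:<id>; splitting on ":" takes the trailing id, and the stable /jobs/view/ URL is built from it. A standalone sketch with a made-up URN:

    data_entity_urn = "urn:li:jobPosting:3590245467"  # hypothetical value
    job_id = data_entity_urn.split(":")[-1]           # "3590245467"
    job_url = f"https://www.linkedin.com/jobs/view/{job_id}"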
@@ -91,7 +96,7 @@ class LinkedInScraper(Scraper):
                         company_name=company,
                         location=location,
                         date_posted=date_posted,
-                        delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                        job_url=job_url,
                     )
                     job_list.append(job_post)
                     if len(job_list) >= scraper_input.results_wanted:
@@ -38,8 +38,11 @@ class ZipRecruiterScraper(Scraper):
             params = {
                 "search": scraper_input.search_term,
                 "location": scraper_input.location,
-                "page": page,
                 "radius": scraper_input.distance,
+                "refine_by_location_type": "only_remote"
+                if scraper_input.is_remote
+                else None,
+                "page": page,
             }

             response = session.get(
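ZipRecruiter exposes the same switch as refine_by_location_type=only_remote; the line-wrapped conditional appears to be Black's formatting of a ternary inside a dict literal. A hypothetical alternative (not in this codebase) is to prune None entries explicitly instead of relying on requests to drop them:

    def prune(params: dict) -> dict:
        """Drop None-valued entries before the request is built."""
        return {k: v for k, v in params.items() if v is not None}

    is_remote = False
    print(prune({
        "search": "python developer",  # illustrative values
        "refine_by_location_type": "only_remote" if is_remote else None,
        "page": 1,
    }))  # {'search': 'python developer', 'page': 1}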
@@ -88,7 +91,7 @@ class ZipRecruiterScraper(Scraper):
                 job_type=job_type,
                 compensation=ZipRecruiterScraper.get_compensation(job),
                 date_posted=date_posted,
-                delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                job_url=job_url,
             )
             job_list.append(job_post)
             if len(job_list) >= scraper_input.results_wanted: