mirror of https://github.com/Bunsly/JobSpy
feat(ziprecruiter): Add multithreading for individual job handling
parent
8b04508b15
commit
eb728a572a
|
@ -83,10 +83,15 @@ class ZipRecruiterScraper(Scraper):
|
||||||
|
|
||||||
job_posts = soup.find_all("div", {"class": "job_content"})
|
job_posts = soup.find_all("div", {"class": "job_content"})
|
||||||
|
|
||||||
for job in job_posts:
|
def process_job(job: Tag) -> Optional[JobPost]:
|
||||||
|
'''
|
||||||
|
Parses a job from the job content tag
|
||||||
|
:param job: BeautifulSoup Tag for one job post
|
||||||
|
:return: JobPost, or None if the job URL has already been seen
|
||||||
|
'''
|
||||||
job_url = job.find("a", {"class": "job_link"})["href"]
|
job_url = job.find("a", {"class": "job_link"})["href"]
|
||||||
if job_url in self.seen_urls:
|
if job_url in self.seen_urls:
|
||||||
continue
|
return None
|
||||||
|
|
||||||
title = job.find("h2", {"class": "title"}).text
|
title = job.find("h2", {"class": "title"}).text
|
||||||
company = job.find("a", {"class": "company_name"}).text.strip()
|
company = job.find("a", {"class": "company_name"}).text.strip()
|
||||||
|
@ -121,7 +126,14 @@ class ZipRecruiterScraper(Scraper):
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
)
|
)
|
||||||
job_list.append(job_post)
|
return job_post
|
||||||
|
|
||||||
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||||
|
job_results: list[Future] = [
|
||||||
|
executor.submit(process_job, job) for job in job_posts
|
||||||
|
]
|
||||||
|
|
||||||
|
job_list = [result.result() for result in job_results if result.result()]
|
||||||
|
|
||||||
return job_list, job_count
|
return job_list, job_count
|
||||||
|
|
||||||
|
@ -171,12 +183,14 @@ class ZipRecruiterScraper(Scraper):
|
||||||
return job_response
|
return job_response
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_description(cls, job_page_url: str, session: tls_client.Session) -> Tuple[Optional[str], str]:
|
def get_description(
|
||||||
|
cls, job_page_url: str, session: tls_client.Session
|
||||||
|
) -> Tuple[Optional[str], str]:
|
||||||
"""
|
"""
|
||||||
Retrieves job description by going to the job page url
|
Retrieves job description by going to the job page url
|
||||||
:param job_page_url:
|
:param job_page_url:
|
||||||
:param session:
|
:param session:
|
||||||
:return: description or None, response url
|
:return: description or None, response url
|
||||||
"""
|
"""
|
||||||
response = session.get(
|
response = session.get(
|
||||||
job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
|
job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
|
||||||
|
|
Loading…
Reference in New Issue