Merge pull request #8 from JobSpy-ai/features/parallel-page-processing

Features/parallel page processing
Cullen 2023-07-11 12:25:14 -05:00 committed by GitHub
commit b36cfaeca2
5 changed files with 279 additions and 210 deletions

View File

@@ -2,7 +2,7 @@ from typing import Union
 from datetime import datetime
 from enum import Enum
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 
 
 class JobType(Enum):
@@ -57,5 +57,13 @@ class JobResponse(BaseModel):
     success: bool
     error: str = None
-    job_count: int = None
+    total_results: int = None
+    returned_results: int = None
 
     jobs: list[JobPost] = []
+
+    @validator("returned_results")
+    def set_returned_results(cls, v, values):
+        if v is None and values.get("jobs"):
+            return len(values["jobs"])
+        return v
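
The new returned_results validator backfills the field from the length of jobs when the caller does not set it explicitly. Below is a minimal standalone sketch of the same pattern using the pydantic v1 API; the PageResult model is hypothetical, not part of the PR. Two v1 subtleties worth noting: `values` only contains fields declared *before* the one being validated, and a validator is skipped for omitted fields unless `always=True` is passed, so the sketch declares jobs first and adds `always=True`.

from pydantic import BaseModel, validator


class PageResult(BaseModel):
    # hypothetical model for illustration only
    jobs: list[str] = []
    returned_results: int = None

    @validator("returned_results", always=True)
    def set_returned_results(cls, v, values):
        # fall back to the number of jobs when the caller didn't set it
        if v is None and values.get("jobs"):
            return len(values["jobs"])
        return v


print(PageResult(jobs=["a", "b"]).returned_results)  # -> 2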

View File

@ -1,6 +1,11 @@
from ..jobs import * from ..jobs import *
class StatusException(Exception):
def __init__(self, status_code: int):
self.status_code = status_code
class Site(Enum): class Site(Enum):
LINKEDIN = "linkedin" LINKEDIN = "linkedin"
INDEED = "indeed" INDEED = "indeed"
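
StatusException carries the HTTP status code out of the per-page scrapers so the top-level scrape() can convert it into a failed JobResponse in one place. A short sketch of that flow, where fetch() is a hypothetical stand-in for a scrape_page call:

from api.core.scrapers import StatusException


def fetch(status_code: int) -> None:
    # hypothetical stand-in for a scrape_page call
    if status_code != 200:
        raise StatusException(status_code)


try:
    fetch(429)
except StatusException as e:
    print(f"site returned status code {e.status_code}")  # -> 429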

View File

@ -1,6 +1,6 @@
import re import re
import json import json
from typing import Optional from typing import Optional, Tuple, List
import tls_client import tls_client
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -8,7 +8,11 @@ from bs4.element import Tag
 from fastapi import status
 
 from api.core.jobs import *
-from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import JobPost
+from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
+from concurrent.futures import ThreadPoolExecutor, Future
+import math
 
 
 class ParsingException(Exception):
@@ -25,21 +29,22 @@ class IndeedScraper(Scraper):
         self.url = "https://www.indeed.com/jobs"
         self.job_url = "https://www.indeed.com/viewjob?jk="
+        self.jobs_per_page = 15
+        self.seen_urls = set()
 
-    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+    def scrape_page(
+        self, scraper_input: ScraperInput, page: int, session: tls_client.Session
+    ) -> tuple[list[JobPost], int]:
         """
-        Scrapes Indeed for jobs with scraper_input criteria
+        Scrapes a page of Indeed for jobs with scraper_input criteria
         :param scraper_input:
-        :return: job_response
+        :param page:
+        :param session:
+        :return: jobs found on page, total number of jobs found for search
         """
-        session = tls_client.Session(
-            client_identifier="chrome112", random_tls_extension_order=True
-        )
-
-        job_list: list[JobPost] = []
-        page = 0
-        processed_jobs, total_num_jobs = 0, 0
-        seen_urls = set()
-        while len(job_list) < scraper_input.results_wanted:
-            params = {
+        job_list = []
+
+        params = {
             "q": scraper_input.search_term,
             "location": scraper_input.location,
@@ -61,21 +66,13 @@ class IndeedScraper(Scraper):
             response.status_code != status.HTTP_200_OK
             and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
         ):
-            return JobResponse(
-                success=False,
-                error=f"Response returned {response.status_code}",
-            )
+            raise StatusException(response.status_code)
 
         soup = BeautifulSoup(response.content, "html.parser")
 
-        try:
-            jobs = IndeedScraper.parse_jobs(soup)
-        except ParsingException:
-            return JobResponse(
-                success=False,
-                error="Failed to parse jobs.",
-            )
+        jobs = IndeedScraper.parse_jobs(
+            soup
+        )  #: can raise exception, handled by main scrape function
 
         total_num_jobs = IndeedScraper.total_jobs(soup)
 
         if (
@@ -83,25 +80,20 @@ class IndeedScraper(Scraper):
             .get("mosaicProviderJobCardsModel", {})
             .get("results")
         ):
-            return JobResponse(
-                success=False,
-                error="No jobs found",
-            )
+            raise Exception("No jobs found.")
 
         for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-            processed_jobs += 1
             job_url = f'{self.job_url}{job["jobkey"]}'
-            if job_url in seen_urls:
+            if job_url in self.seen_urls:
                 continue
 
             snippet_html = BeautifulSoup(job["snippet"], "html.parser")
 
             extracted_salary = job.get("extractedSalary")
             compensation = None
             if extracted_salary:
                 salary_snippet = job.get("salarySnippet")
-                currency = (
-                    salary_snippet.get("currency") if salary_snippet else None
-                )
+                currency = salary_snippet.get("currency") if salary_snippet else None
                 interval = (extracted_salary.get("type"),)
                 if isinstance(interval, tuple):
                     interval = interval[0]
@@ -136,24 +128,61 @@ class IndeedScraper(Scraper):
                     job_url=job_url,
                 )
                 job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= total_num_jobs
-                ):
-                    break
-
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= total_num_jobs
-            ):
-                break
-
-            page += 1
+        return job_list, total_num_jobs
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """
+        Scrapes Indeed for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: job_response
+        """
+        session = tls_client.Session(
+            client_identifier="chrome112", random_tls_extension_order=True
+        )
+
+        pages_to_process = (
+            math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
+        )
+
+        try:
+            #: get first page to initialize session
+            job_list, total_results = self.scrape_page(scraper_input, 0, session)
+
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures: list[Future] = [
+                    executor.submit(self.scrape_page, scraper_input, page, session)
+                    for page in range(1, pages_to_process + 1)
+                ]
+
+                for future in futures:
+                    jobs, _ = future.result()
+                    job_list += jobs
+        except StatusException as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed returned status code {e.status_code}",
+            )
+        except ParsingException as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed failed to parse response: {e}",
+            )
+        except Exception as e:
+            return JobResponse(
+                success=False,
+                error=f"Indeed failed to scrape: {e}",
+            )
+
+        if len(job_list) > scraper_input.results_wanted:
             job_list = job_list[: scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=total_num_jobs,
+            total_results=total_results,
         )
         return job_response
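
The restructuring above is the heart of the PR: scrape() fetches page 0 synchronously to warm up the session, then fans the remaining pages out across a thread pool and stitches the results together in submission order, so exceptions raised in worker threads surface through future.result() inside the same try block. A runnable standalone sketch of the pattern, with fetch_page() as a hypothetical stand-in for scrape_page():

import math
from concurrent.futures import ThreadPoolExecutor, Future


def fetch_page(page: int) -> list[str]:
    # hypothetical stand-in for IndeedScraper.scrape_page()
    return [f"job-{page}-{i}" for i in range(15)]


results_wanted, jobs_per_page = 40, 15
pages_to_process = math.ceil(results_wanted / jobs_per_page) - 1  # 2 extra pages

job_list = fetch_page(0)  # first page fetched synchronously
with ThreadPoolExecutor(max_workers=10) as executor:
    futures: list[Future] = [
        executor.submit(fetch_page, page) for page in range(1, pages_to_process + 1)
    ]
    for future in futures:
        job_list += future.result()  # re-raises any worker exception here

print(len(job_list))  # 45; the caller trims this to results_wanted

Iterating the futures in submission order keeps the final list page-ordered even though the requests complete out of order.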

View File

@@ -131,7 +131,7 @@ class LinkedInScraper(Scraper):
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=job_count,
+            total_results=job_count,
         )
         return job_response

View File

@ -1,13 +1,16 @@
import json import json
from typing import Optional from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
import tls_client import tls_client
from fastapi import status from fastapi import status
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, Future
from api.core.scrapers import Scraper, ScraperInput, Site from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import * from api.core.jobs import *
import math
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
@@ -19,22 +22,22 @@ class ZipRecruiterScraper(Scraper):
         super().__init__(site)
         self.url = "https://www.ziprecruiter.com/jobs-search"
+        self.jobs_per_page = 20
+        self.seen_urls = set()
 
-    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+    def scrape_page(
+        self, scraper_input: ScraperInput, page: int, session: tls_client.Session
+    ) -> tuple[list[JobPost], int | None]:
         """
-        Scrapes ZipRecruiter for jobs with scraper_input criteria
+        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
-        :return: job_response
+        :param page:
+        :param session:
+        :return: jobs found on page, total number of jobs found for search
         """
-        session = tls_client.Session(
-            client_identifier="chrome112", random_tls_extension_order=True
-        )
-
-        job_list: list[JobPost] = []
-        page = 1
-        processed_jobs, job_count = 0, 0
-        seen_urls = set()
-        while len(job_list) < scraper_input.results_wanted:
+        job_list = []
+
         job_type_value = None
         if scraper_input.job_type:
             if scraper_input.job_type.value == "fulltime":
@@ -60,12 +63,9 @@ class ZipRecruiterScraper(Scraper):
         response = session.get(
             self.url, headers=ZipRecruiterScraper.headers(), params=params
         )
-        print(response.url)
         if response.status_code != status.HTTP_200_OK:
-            return JobResponse(
-                success=False,
-                error=f"Response returned {response.status_code}",
-            )
+            raise StatusException(response.status_code)
 
         html_string = response.content
         soup = BeautifulSoup(html_string, "html.parser")
@@ -73,16 +73,17 @@ class ZipRecruiterScraper(Scraper):
         script_tag = soup.find("script", {"id": "js_variables"})
 
         if script_tag:
             data = json.loads(script_tag.string)
-            job_count = data["totalJobCount"]
-            job_count = int(job_count.replace(",", ""))
+            job_count = int(data["totalJobCount"].replace(",", ""))
+        else:
+            job_count = None
 
         job_posts = soup.find_all("div", {"class": "job_content"})
         for job in job_posts:
-            processed_jobs += 1
             job_url = job.find("a", {"class": "job_link"})["href"]
-            if job_url in seen_urls:
+            if job_url in self.seen_urls:
                 continue
 
             title = job.find("h2", {"class": "title"}).text
             company = job.find("a", {"class": "company_name"}).text.strip()
             description = job.find("p", {"class": "job_snippet"}).text.strip()
@@ -114,25 +115,51 @@ class ZipRecruiterScraper(Scraper):
                 job_url=job_url,
             )
             job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= job_count
-                ):
-                    break
-
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= job_count
-            ):
-                break
-
-            page += 1
+        return job_list, job_count
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """
+        Scrapes ZipRecruiter for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: job_response
+        """
+        session = tls_client.Session(
+            client_identifier="chrome112", random_tls_extension_order=True
+        )
+
+        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
+
+        try:
+            #: get first page to initialize session
+            job_list, total_results = self.scrape_page(scraper_input, 1, session)
+
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures: list[Future] = [
+                    executor.submit(self.scrape_page, scraper_input, page, session)
+                    for page in range(2, pages_to_process + 1)
+                ]
+
+                for future in futures:
+                    jobs, _ = future.result()
+                    job_list += jobs
+        except StatusException as e:
+            return JobResponse(
+                success=False,
+                error=f"ZipRecruiter returned status code {e.status_code}",
+            )
+
+        #: note: this does not handle if the results are more or less than the results_wanted
+        if len(job_list) > scraper_input.results_wanted:
             job_list = job_list[: scraper_input.results_wanted]
 
         job_response = JobResponse(
             success=True,
             jobs=job_list,
-            job_count=job_count,
+            total_results=total_results,
         )
         return job_response
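
One easy-to-miss difference between the two scrapers: Indeed's pagination is 0-indexed while ZipRecruiter's is 1-indexed, which is why the Indeed version subtracts 1 from pages_to_process and submits range(1, ...) while ZipRecruiter submits range(2, ...). The arithmetic, checked by hand in a small sketch (page counts taken from the jobs_per_page values in the diff):

import math

results_wanted = 50

# Indeed: 15 jobs per page, first page fetched separately as page 0
indeed_extra = math.ceil(results_wanted / 15) - 1      # 3 extra pages
indeed_pages = [0] + list(range(1, indeed_extra + 1))  # [0, 1, 2, 3]

# ZipRecruiter: 20 jobs per page, first page fetched separately as page 1
zip_total = math.ceil(results_wanted / 20)             # 3 pages
zip_pages = [1] + list(range(2, zip_total + 1))        # [1, 2, 3]

print(indeed_pages, zip_pages)  # 4 pages x 15 = 60 jobs; 3 pages x 20 = 60 jobs

Both end up over-fetching slightly, which is why each scrape() trims job_list to results_wanted before building the JobResponse.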