Merge pull request #8 from JobSpy-ai/features/parallel-page-processing

Features/parallel page processing
Cullen 2023-07-11 12:25:14 -05:00 committed by GitHub
commit b36cfaeca2
5 changed files with 279 additions and 210 deletions

View File

@@ -2,7 +2,7 @@ from typing import Union
from datetime import datetime
from enum import Enum
from pydantic import BaseModel
from pydantic import BaseModel, validator
class JobType(Enum):
@@ -57,5 +57,13 @@ class JobResponse(BaseModel):
success: bool
error: str = None
job_count: int = None
total_results: int = None
returned_results: int = None
jobs: list[JobPost] = []
@validator("returned_results")
def set_returned_results(cls, v, values):
if v is None and values.get("jobs"):
return len(values["jobs"])
return v
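
Review note: in pydantic v1, values inside a validator only contains fields declared before the one being validated, and by default a validator is skipped when the field is not supplied at all. For set_returned_results to fire on the default None, jobs has to be declared above returned_results and the validator needs always=True. A minimal sketch under those assumptions (the str jobs are a hypothetical stand-in for JobPost):

from pydantic import BaseModel, validator

class JobResponse(BaseModel):
    success: bool
    jobs: list[str] = []  #: declared first so it is visible in values below
    returned_results: int = None

    #: always=True (not in the diff) so the default None is validated too
    @validator("returned_results", always=True)
    def set_returned_results(cls, v, values):
        if v is None and values.get("jobs"):
            return len(values["jobs"])
        return v

resp = JobResponse(success=True, jobs=["job_a", "job_b", "job_c"])
assert resp.returned_results == 3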

View File

@@ -1,6 +1,11 @@
from ..jobs import *
class StatusException(Exception):
def __init__(self, status_code: int):
self.status_code = status_code
class Site(Enum):
LINKEDIN = "linkedin"
INDEED = "indeed"
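
Review note: StatusException lets scrape_page raise a bad HTTP status instead of building an error JobResponse deep inside page logic; the orchestrating scrape call catches it once and maps it to a response, as the Indeed and ZipRecruiter diffs below do. A minimal sketch of the pattern (fetch_page is a hypothetical stand-in):

class StatusException(Exception):
    def __init__(self, status_code: int):
        self.status_code = status_code

def fetch_page(status_code: int) -> list:
    #: worker raises rather than returning an error object
    if status_code != 200:
        raise StatusException(status_code)
    return []

try:
    jobs = fetch_page(429)
except StatusException as e:
    print(f"scrape failed with status code {e.status_code}")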

View File

@@ -1,6 +1,6 @@
import re
import json
from typing import Optional
from typing import Optional, Tuple, List
import tls_client
from bs4 import BeautifulSoup
@@ -8,7 +8,11 @@ from bs4.element import Tag
from fastapi import status
from api.core.jobs import *
from api.core.scrapers import Scraper, ScraperInput, Site
from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from concurrent.futures import ThreadPoolExecutor, Future
import math
class ParsingException(Exception):
@@ -25,21 +29,22 @@ class IndeedScraper(Scraper):
self.url = "https://www.indeed.com/jobs"
self.job_url = "https://www.indeed.com/viewjob?jk="
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
self.jobs_per_page = 15
self.seen_urls = set()
def scrape_page(
self, scraper_input: ScraperInput, page: int, session: tls_client.Session
) -> tuple[list[JobPost], int]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param scraper_input:
:param page:
:param session:
:return: jobs found on page, total number of jobs found for search
"""
job_list = []
job_list: list[JobPost] = []
page = 0
processed_jobs, total_num_jobs = 0, 0
seen_urls = set()
while len(job_list) < scraper_input.results_wanted:
params = {
"q": scraper_input.search_term,
"location": scraper_input.location,
@@ -61,21 +66,13 @@ class IndeedScraper(Scraper):
response.status_code != status.HTTP_200_OK
and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
):
return JobResponse(
success=False,
error=f"Response returned {response.status_code}",
)
raise StatusException(response.status_code)
soup = BeautifulSoup(response.content, "html.parser")
try:
jobs = IndeedScraper.parse_jobs(soup)
except ParsingException:
return JobResponse(
success=False,
error="Failed to parse jobs.",
)
jobs = IndeedScraper.parse_jobs(
soup
) #: can raise exception, handled by main scrape function
total_num_jobs = IndeedScraper.total_jobs(soup)
if (
@@ -83,25 +80,20 @@
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
return JobResponse(
success=False,
error="No jobs found",
)
raise Exception("No jobs found.")
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
processed_jobs += 1
job_url = f'{self.job_url}{job["jobkey"]}'
if job_url in seen_urls:
if job_url in self.seen_urls:
continue
snippet_html = BeautifulSoup(job["snippet"], "html.parser")
extracted_salary = job.get("extractedSalary")
compensation = None
if extracted_salary:
salary_snippet = job.get("salarySnippet")
currency = (
salary_snippet.get("currency") if salary_snippet else None
)
currency = salary_snippet.get("currency") if salary_snippet else None
interval = (extracted_salary.get("type"),)
if isinstance(interval, tuple):
interval = interval[0]
@@ -136,24 +128,61 @@
job_url=job_url,
)
job_list.append(job_post)
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= total_num_jobs
):
break
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= total_num_jobs
):
break
page += 1
return job_list, total_num_jobs
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
pages_to_process = (
math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
)
try:
#: get first page to initialize session
job_list, total_results = self.scrape_page(scraper_input, 0, session)
with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [
executor.submit(self.scrape_page, scraper_input, page, session)
for page in range(1, pages_to_process + 1)
]
for future in futures:
jobs, _ = future.result()
job_list += jobs
except StatusException as e:
return JobResponse(
success=False,
error=f"Indeed returned status code {e.status_code}",
)
except ParsingException as e:
return JobResponse(
success=False,
error=f"Indeed failed to parse response: {e}",
)
except Exception as e:
return JobResponse(
success=False,
error=f"Indeed failed to scrape: {e}",
)
if len(job_list) > scraper_input.results_wanted:
job_list = job_list[: scraper_input.results_wanted]
job_response = JobResponse(
success=True,
jobs=job_list,
job_count=total_num_jobs,
total_results=total_results,
)
return job_response
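
Review note: the page arithmetic deserves a spell-out. Indeed pages are 0-indexed and page 0 is fetched synchronously (it also initializes the session), so only ceil(results_wanted / jobs_per_page) - 1 extra pages go to the pool. For results_wanted=50 and jobs_per_page=15 that is ceil(50/15) - 1 = 3, i.e. pages 0 through 3 for up to 60 jobs before trimming. A self-contained sketch of the fan-out (scrape_page is stubbed):

import math
from concurrent.futures import ThreadPoolExecutor, Future

jobs_per_page = 15
results_wanted = 50
pages_to_process = math.ceil(results_wanted / jobs_per_page) - 1  #: 3 extra pages

def scrape_page(page: int) -> tuple[list[str], int]:
    #: hypothetical stub standing in for IndeedScraper.scrape_page
    return [f"job-{page}-{i}" for i in range(jobs_per_page)], 999

#: page 0 runs first and alone, warming up the tls_client session
job_list, total_results = scrape_page(0)
with ThreadPoolExecutor(max_workers=10) as executor:
    futures: list[Future] = [
        executor.submit(scrape_page, page)
        for page in range(1, pages_to_process + 1)
    ]
    #: iterating futures in submission order keeps pages sorted
    for future in futures:
        jobs, _ = future.result()
        job_list += jobs

job_list = job_list[:results_wanted]  #: trim whole-page overshoot
assert len(job_list) == 50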

View File

@@ -131,7 +131,7 @@ class LinkedInScraper(Scraper):
job_response = JobResponse(
success=True,
jobs=job_list,
job_count=job_count,
total_results=job_count,
)
return job_response

View File

@@ -1,13 +1,16 @@
import json
from typing import Optional
from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs
import tls_client
from fastapi import status
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, Future
from api.core.scrapers import Scraper, ScraperInput, Site
from api.core.jobs import JobPost
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
from api.core.jobs import *
import math
class ZipRecruiterScraper(Scraper):
@@ -19,22 +22,22 @@ class ZipRecruiterScraper(Scraper):
super().__init__(site)
self.url = "https://www.ziprecruiter.com/jobs-search"
self.jobs_per_page = 20
self.seen_urls = set()
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
def scrape_page(
self, scraper_input: ScraperInput, page: int, session: tls_client.Session
) -> tuple[list[JobPost], int | None]:
"""
Scrapes ZipRecruiter for jobs with scraper_input criteria
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
:param page:
:param session:
:return: jobs found on page, total number of jobs found for search
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
job_list: list[JobPost] = []
page = 1
processed_jobs, job_count = 0, 0
seen_urls = set()
while len(job_list) < scraper_input.results_wanted:
job_list = []
job_type_value = None
if scraper_input.job_type:
if scraper_input.job_type.value == "fulltime":
@@ -60,12 +63,9 @@ class ZipRecruiterScraper(Scraper):
response = session.get(
self.url, headers=ZipRecruiterScraper.headers(), params=params
)
print(response.url)
if response.status_code != status.HTTP_200_OK:
return JobResponse(
success=False,
error=f"Response returned {response.status_code}",
)
raise StatusException(response.status_code)
html_string = response.content
soup = BeautifulSoup(html_string, "html.parser")
@@ -73,16 +73,17 @@
script_tag = soup.find("script", {"id": "js_variables"})
data = json.loads(script_tag.string)
job_count = data["totalJobCount"]
job_count = int(job_count.replace(",", ""))
job_count = int(data["totalJobCount"].replace(",", ""))
else:
job_count = None
job_posts = soup.find_all("div", {"class": "job_content"})
for job in job_posts:
processed_jobs += 1
job_url = job.find("a", {"class": "job_link"})["href"]
if job_url in seen_urls:
if job_url in self.seen_urls:
continue
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description = job.find("p", {"class": "job_snippet"}).text.strip()
@@ -114,25 +115,51 @@
job_url=job_url,
)
job_list.append(job_post)
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
break
if (
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
break
return job_list, job_count
page += 1
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
try:
#: get first page to initialize session
job_list, total_results = self.scrape_page(scraper_input, 1, session)
with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [
executor.submit(self.scrape_page, scraper_input, page, session)
for page in range(2, pages_to_process + 1)
]
for future in futures:
jobs, _ = future.result()
job_list += jobs
except StatusException as e:
return JobResponse(
success=False,
error=f"ZipRecruiter returned status code {e.status_code}",
)
#: note: this does not handle if the results are more or less than the results_wanted
if len(job_list) > scraper_input.results_wanted:
job_list = job_list[: scraper_input.results_wanted]
job_response = JobResponse(
success=True,
jobs=job_list,
job_count=job_count,
total_results=total_results,
)
return job_response
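
Review note: ZipRecruiter pages are 1-indexed, so there is no -1 here: for results_wanted=50 and jobs_per_page=20, pages_to_process = ceil(50/20) = 3, with page 1 fetched synchronously and pages 2-3 submitted to the pool. One thing to watch in both scrapers is that self.seen_urls is now a check-then-act on a set shared across pool threads; CPython's GIL makes each individual set operation atomic, but the membership check and the add can still interleave. A lock-guarded helper is one defensive option (a sketch, not in the diff):

from threading import Lock

class SeenUrls:
    #: thread-safe de-duplication for job URLs (hypothetical hardening)
    def __init__(self):
        self._urls: set[str] = set()
        self._lock = Lock()

    def add_if_new(self, url: str) -> bool:
        #: atomic check-then-add, so two pool threads cannot both claim a URL
        with self._lock:
            if url in self._urls:
                return False
            self._urls.add(url)
            return True

seen = SeenUrls()
assert seen.add_if_new("https://www.ziprecruiter.com/jobs/1")
assert not seen.add_if_new("https://www.ziprecruiter.com/jobs/1")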