mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-06 12:34:32 -08:00
Validation error (#35)
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
from ..jobs import Enum, BaseModel, JobType, JobResponse
|
||||
from typing import List, Dict, Optional, Any
|
||||
from typing import List, Optional, Any
|
||||
|
||||
|
||||
class StatusException(Exception):
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
import re
|
||||
import sys
|
||||
import math
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Tuple, List
|
||||
from typing import Optional
|
||||
|
||||
import tls_client
|
||||
import urllib.parse
|
||||
@@ -11,7 +10,14 @@ from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
|
||||
from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
CompensationInterval,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
)
|
||||
from .. import Scraper, ScraperInput, Site, StatusException
|
||||
|
||||
|
||||
@@ -61,10 +67,7 @@ class IndeedScraper(Scraper):
|
||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||
response = session.get(self.url + "/jobs", params=params)
|
||||
|
||||
if (
|
||||
response.status_code != 200
|
||||
and response.status_code != 307
|
||||
):
|
||||
if response.status_code != 200 and response.status_code != 307:
|
||||
raise StatusException(response.status_code)
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
@@ -136,8 +139,10 @@ class IndeedScraper(Scraper):
|
||||
return job_post
|
||||
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
job_results: list[Future] = [executor.submit(process_job, job) for job in
|
||||
jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]]
|
||||
job_results: list[Future] = [
|
||||
executor.submit(process_job, job)
|
||||
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||
]
|
||||
|
||||
job_list = [result.result() for result in job_results if result.result()]
|
||||
|
||||
|
||||
@@ -6,7 +6,14 @@ from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ...jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
Compensation,
|
||||
CompensationInterval,
|
||||
)
|
||||
|
||||
|
||||
class LinkedInScraper(Scraper):
|
||||
@@ -117,7 +124,9 @@ class LinkedInScraper(Scraper):
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
job_type=job_type,
|
||||
compensation=Compensation(interval=CompensationInterval.YEARLY, currency="USD")
|
||||
compensation=Compensation(
|
||||
interval=CompensationInterval.YEARLY, currency="USD"
|
||||
),
|
||||
)
|
||||
job_list.append(job_post)
|
||||
if (
|
||||
|
||||
@@ -2,7 +2,7 @@ import math
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional, Tuple, List
|
||||
from typing import Optional, Tuple
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
import tls_client
|
||||
@@ -11,7 +11,14 @@ from bs4.element import Tag
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
|
||||
from .. import Scraper, ScraperInput, Site, StatusException
|
||||
from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
CompensationInterval,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
)
|
||||
|
||||
|
||||
class ZipRecruiterScraper(Scraper):
|
||||
@@ -55,7 +62,7 @@ class ZipRecruiterScraper(Scraper):
|
||||
"search": scraper_input.search_term,
|
||||
"location": scraper_input.location,
|
||||
"page": page,
|
||||
"form": "jobs-landing"
|
||||
"form": "jobs-landing",
|
||||
}
|
||||
|
||||
if scraper_input.is_remote:
|
||||
@@ -65,7 +72,9 @@ class ZipRecruiterScraper(Scraper):
|
||||
params["radius"] = scraper_input.distance
|
||||
|
||||
if job_type_value:
|
||||
params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
|
||||
params[
|
||||
"refine_by_employment"
|
||||
] = f"employment_type:employment_type:{job_type_value}"
|
||||
|
||||
response = self.session.get(
|
||||
self.url + "/jobs-search",
|
||||
@@ -90,11 +99,14 @@ class ZipRecruiterScraper(Scraper):
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
if "jobList" in data and data["jobList"]:
|
||||
jobs_js = data["jobList"]
|
||||
job_results = [executor.submit(self.process_job_js, job) for job in jobs_js]
|
||||
job_results = [
|
||||
executor.submit(self.process_job_js, job) for job in jobs_js
|
||||
]
|
||||
else:
|
||||
jobs_html = soup.find_all("div", {"class": "job_content"})
|
||||
job_results = [executor.submit(self.process_job_html, job) for job in
|
||||
jobs_html]
|
||||
job_results = [
|
||||
executor.submit(self.process_job_html, job) for job in jobs_html
|
||||
]
|
||||
|
||||
job_list = [result.result() for result in job_results if result.result()]
|
||||
|
||||
@@ -107,8 +119,9 @@ class ZipRecruiterScraper(Scraper):
|
||||
:return: job_response
|
||||
"""
|
||||
|
||||
|
||||
pages_to_process = max(3, math.ceil(scraper_input.results_wanted / self.jobs_per_page))
|
||||
pages_to_process = max(
|
||||
3, math.ceil(scraper_input.results_wanted / self.jobs_per_page)
|
||||
)
|
||||
|
||||
try:
|
||||
#: get first page to initialize session
|
||||
@@ -125,7 +138,6 @@ class ZipRecruiterScraper(Scraper):
|
||||
|
||||
job_list += jobs
|
||||
|
||||
|
||||
except StatusException as e:
|
||||
return JobResponse(
|
||||
success=False,
|
||||
@@ -162,9 +174,7 @@ class ZipRecruiterScraper(Scraper):
|
||||
title = job.find("h2", {"class": "title"}).text
|
||||
company = job.find("a", {"class": "company_name"}).text.strip()
|
||||
|
||||
description, updated_job_url = self.get_description(
|
||||
job_url
|
||||
)
|
||||
description, updated_job_url = self.get_description(job_url)
|
||||
if updated_job_url is not None:
|
||||
job_url = updated_job_url
|
||||
if description is None:
|
||||
@@ -173,10 +183,7 @@ class ZipRecruiterScraper(Scraper):
|
||||
job_type_element = job.find("li", {"class": "perk_item perk_type"})
|
||||
if job_type_element:
|
||||
job_type_text = (
|
||||
job_type_element.text.strip()
|
||||
.lower()
|
||||
.replace("-", "")
|
||||
.replace(" ", "")
|
||||
job_type_element.text.strip().lower().replace("-", "").replace(" ", "")
|
||||
)
|
||||
if job_type_text == "contractor":
|
||||
job_type_text = "contract"
|
||||
@@ -201,12 +208,16 @@ class ZipRecruiterScraper(Scraper):
|
||||
def process_job_js(self, job: dict) -> JobPost:
|
||||
# Map the job data to the expected fields by the Pydantic model
|
||||
title = job.get("Title")
|
||||
description = BeautifulSoup(job.get("Snippet","").strip(), "html.parser").get_text()
|
||||
description = BeautifulSoup(
|
||||
job.get("Snippet", "").strip(), "html.parser"
|
||||
).get_text()
|
||||
|
||||
company = job.get("OrgName")
|
||||
location = Location(city=job.get("City"), state=job.get("State"))
|
||||
try:
|
||||
job_type = ZipRecruiterScraper.job_type_from_string(job.get("EmploymentType", "").replace("-", "_").lower())
|
||||
job_type = ZipRecruiterScraper.job_type_from_string(
|
||||
job.get("EmploymentType", "").replace("-", "_").lower()
|
||||
)
|
||||
except ValueError:
|
||||
# print(f"Skipping job due to unrecognized job type: {job.get('EmploymentType')}")
|
||||
return None
|
||||
@@ -215,14 +226,14 @@ class ZipRecruiterScraper(Scraper):
|
||||
salary_parts = formatted_salary.split(" ")
|
||||
|
||||
min_salary_str = salary_parts[0][1:].replace(",", "")
|
||||
if '.' in min_salary_str:
|
||||
if "." in min_salary_str:
|
||||
min_amount = int(float(min_salary_str) * 1000)
|
||||
else:
|
||||
min_amount = int(min_salary_str.replace("K", "000"))
|
||||
|
||||
if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
|
||||
max_salary_str = salary_parts[2][1:].replace(",", "")
|
||||
if '.' in max_salary_str:
|
||||
if "." in max_salary_str:
|
||||
max_amount = int(float(max_salary_str) * 1000)
|
||||
else:
|
||||
max_amount = int(max_salary_str.replace("K", "000"))
|
||||
@@ -232,10 +243,12 @@ class ZipRecruiterScraper(Scraper):
|
||||
compensation = Compensation(
|
||||
interval=CompensationInterval.YEARLY,
|
||||
min_amount=min_amount,
|
||||
max_amount=max_amount
|
||||
max_amount=max_amount,
|
||||
)
|
||||
save_job_url = job.get("SaveJobURL", "")
|
||||
posted_time_match = re.search(r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url)
|
||||
posted_time_match = re.search(
|
||||
r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
|
||||
)
|
||||
if posted_time_match:
|
||||
date_time_str = posted_time_match.group(1)
|
||||
date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ")
|
||||
@@ -269,10 +282,7 @@ class ZipRecruiterScraper(Scraper):
|
||||
return item
|
||||
raise ValueError(f"Invalid value for JobType: {value}")
|
||||
|
||||
def get_description(
|
||||
self,
|
||||
job_page_url: str
|
||||
) -> Tuple[Optional[str], Optional[str]]:
|
||||
def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""
|
||||
Retrieves job description by going to the job page url
|
||||
:param job_page_url:
|
||||
|
||||
Reference in New Issue
Block a user