Fix Indeed exceptions when parsing job descriptions

pull/62/head v1.1.14
Cullen Watson 2023-10-18 14:25:53 -05:00 committed by GitHub
parent 5e71866630
commit f2cc74b7f2
8 changed files with 79 additions and 48 deletions


@@ -92,9 +92,9 @@ JobPost
 │ ├── city (str)
 │ ├── state (str)
 ├── description (str)
-├── job_type (enum): fulltime, parttime, internship, contract
+├── job_type (str): fulltime, parttime, internship, contract
 ├── compensation (object)
-│ ├── interval (enum): yearly, monthly, weekly, daily, hourly
+│ ├── interval (str): yearly, monthly, weekly, daily, hourly
 │ ├── min_amount (int)
 │ ├── max_amount (int)
 │ └── currency (enum)


@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.13"
+version = "1.1.14"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
-authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
-homepage = "https://github.com/cullenwatson/JobSpy"
+authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
+homepage = "https://github.com/Bunsly/JobSpy"
 readme = "README.md"
 packages = [
@@ -16,6 +16,7 @@ requests = "^2.31.0"
 tls-client = "^0.2.1"
 beautifulsoup4 = "^4.12.2"
 pandas = "^2.1.0"
+NUMPY = "1.24.2"
 pydantic = "^2.3.0"


@@ -84,13 +84,12 @@ def scrape_jobs(
         except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
             raise lie
         except Exception as e:
-            # unhandled exceptions
             if site == Site.LINKEDIN:
-                raise LinkedInException()
+                raise LinkedInException(str(e))
             if site == Site.INDEED:
-                raise IndeedException()
+                raise IndeedException(str(e))
             if site == Site.ZIP_RECRUITER:
-                raise ZipRecruiterException()
+                raise ZipRecruiterException(str(e))
             else:
                 raise e
         return site.value, scraped_data

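Note on the change above: the site-specific exceptions are now raised with the underlying error text, so callers see the real failure reason instead of a bare, message-less exception. A minimal usage sketch, assuming the package exposes scrape_jobs at the top level and the exceptions under jobspy.scrapers.exceptions:

from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import IndeedException

try:
    jobs = scrape_jobs(site_name=["indeed"], search_term="software engineer")
except IndeedException as e:
    # str(e) now carries the original error message wrapped by the scraper
    print(f"Indeed scrape failed: {e}")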

@@ -37,10 +37,16 @@ class JobType(Enum):
         "повназайнятість",
         "toànthờigian",
     )
-    PART_TIME = ("parttime", "teilzeit")
+    PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
     CONTRACT = ("contract", "contractor")
     TEMPORARY = ("temporary",)
-    INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum")
+    INTERNSHIP = (
+        "internship",
+        "prácticas",
+        "ojt(onthejobtraining)",
+        "praktikum",
+        "praktik",
+    )
     PER_DIEM = ("perdiem",)
     NIGHTS = ("nights",)

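The aliases added above are matched after the scrapers normalize labels (lowercase, spaces and hyphens stripped), and each member's value is simply a tuple of those normalized aliases. A small illustrative check, assuming JobType is importable from jobspy.jobs:

from jobspy.jobs import JobType

# membership in the value tuple is how scrapers map normalized labels to members
assert "deltid" in JobType.PART_TIME.value      # new Scandinavian part-time alias
assert "praktik" in JobType.INTERNSHIP.value    # new internship alias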

@@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.
 class LinkedInException(Exception):
     """Failed to scrape LinkedIn"""
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with LinkedIn")


 class IndeedException(Exception):
     """Failed to scrape Indeed"""
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with Indeed")


 class ZipRecruiterException(Exception):
     """Failed to scrape ZipRecruiter"""
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with ZipRecruiter")

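With these constructors, each exception falls back to a generic per-site message when raised without arguments and passes through any message it is given. A quick illustrative sketch (the custom message text here is made up):

from jobspy.scrapers.exceptions import IndeedException

print(IndeedException())                             # -> An error occurred with Indeed
print(IndeedException("description JSON missing"))   # -> description JSON missing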

@@ -16,7 +16,12 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
 from ..exceptions import IndeedException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session
+from ..utils import (
+    count_urgent_words,
+    extract_emails_from_text,
+    create_session,
+    get_enum_from_job_type,
+)
 from ...jobs import (
     JobPost,
     Compensation,
@@ -162,10 +167,10 @@ class IndeedScraper(Scraper):
             )
             return job_post
+        jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
         with ThreadPoolExecutor(max_workers=1) as executor:
             job_results: list[Future] = [
-                executor.submit(process_job, job)
-                for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
+                executor.submit(process_job, job) for job in jobs
             ]
         job_list = [result.result() for result in job_results if result.result()]
@@ -230,12 +235,36 @@
         if response.status_code not in range(200, 400):
             return None
-        raw_description = response.json()["body"]["jobInfoWrapperModel"][
-            "jobInfoModel"
-        ]["sanitizedJobDescription"]
-        with io.StringIO(raw_description) as f:
-            soup = BeautifulSoup(f, "html.parser")
-            text_content = " ".join(soup.get_text().split()).strip()
+        soup = BeautifulSoup(response.text, "html.parser")
+        script_tag = soup.find(
+            "script", text=lambda x: x and "window._initialData" in x
+        )
+        if not script_tag:
+            return None
+
+        script_code = script_tag.string
+        match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
+        if not match:
+            return None
+
+        json_string = match.group(1)
+        data = json.loads(json_string)
+        try:
+            job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
+                "sanitizedJobDescription"
+            ]
+        except (KeyError, TypeError, IndexError):
+            return None
+
+        soup = BeautifulSoup(
+            job_description, "html.parser"
+        )
+        text_content = " ".join(
+            soup.get_text(separator=" ").split()
+        ).strip()

         return text_content

     @staticmethod
@@ -252,22 +281,11 @@
                 label = taxonomy["attributes"][i].get("label")
                 if label:
                     job_type_str = label.replace("-", "").replace(" ", "").lower()
-                    job_types.append(
-                        IndeedScraper.get_enum_from_job_type(job_type_str)
-                    )
+                    job_type = get_enum_from_job_type(job_type_str)
+                    if job_type:
+                        job_types.append(job_type)
         return job_types

-    @staticmethod
-    def get_enum_from_job_type(job_type_str):
-        """
-        Given a string, returns the corresponding JobType enum member if a match is found.
-        """
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return job_type
-        return None
-
     @staticmethod
     def parse_jobs(soup: BeautifulSoup) -> dict:
         """

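Taken together, the new Indeed description path loads the job page HTML, finds the script tag holding window._initialData, parses that JSON, and reads sanitizedJobDescription from jobInfoWrapperModel.jobInfoModel. A condensed standalone sketch of the same flow; extract_description and page_html are illustrative names, not part of the library:

import json
import re

from bs4 import BeautifulSoup


def extract_description(page_html: str) -> str | None:
    soup = BeautifulSoup(page_html, "html.parser")
    # find the script tag that embeds Indeed's initial page state
    script_tag = soup.find("script", text=lambda x: x and "window._initialData" in x)
    if not script_tag:
        return None
    match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_tag.string, re.S)
    if not match:
        return None
    data = json.loads(match.group(1))
    try:
        raw = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
    except (KeyError, TypeError):
        return None
    # the description is itself HTML; reduce it to whitespace-normalized text
    return " ".join(BeautifulSoup(raw, "html.parser").get_text(separator=" ").split()).strip()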

@@ -9,7 +9,6 @@ from datetime import datetime
 import requests
 import time
 import re
-from requests.exceptions import ProxyError
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
@@ -17,7 +16,7 @@ from bs4.element import Tag
 from threading import Lock
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
 from ..exceptions import LinkedInException
 from ...jobs import (
     JobPost,
@@ -237,17 +236,10 @@
             employment_type = employment_type.lower()
             employment_type = employment_type.replace("-", "")
-            return LinkedInScraper.get_enum_from_value(employment_type)
+            return [get_enum_from_job_type(employment_type)]

         return description, get_job_type(soup)

-    @staticmethod
-    def get_enum_from_value(value_str):
-        for job_type in JobType:
-            if value_str in job_type.value:
-                return [job_type]
-        return None
-
     def get_location(self, metadata_card: Optional[Tag]) -> Location:
         """
         Extracts the location data from the job metadata card.


@@ -1,5 +1,6 @@
 import re
 import tls_client
+from ..jobs import JobType


 def count_urgent_words(description: str) -> int:
@@ -42,3 +43,14 @@ def create_session(proxy: str | None = None):
     # }

     return session
+
+
+def get_enum_from_job_type(job_type_str: str) -> JobType | None:
+    """
+    Given a string, returns the corresponding JobType enum member if a match is found.
+    """
+    res = None
+    for job_type in JobType:
+        if job_type_str in job_type.value:
+            res = job_type
+    return res
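
For reference, both scrapers normalize the raw label before handing it to this shared helper, so the lookup is a plain membership test against each JobType value tuple. A brief usage sketch, assuming jobspy.scrapers.utils and jobspy.jobs as the import paths:

from jobspy.jobs import JobType
from jobspy.scrapers.utils import get_enum_from_job_type

label = "Full-time"
normalized = label.replace("-", "").replace(" ", "").lower()   # -> "fulltime"
assert get_enum_from_job_type(normalized) is JobType.FULL_TIME
assert get_enum_from_job_type("not a job type") is None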