Fix Indeed exceptions on parsing description

pull/62/head v1.1.14
Cullen Watson 2023-10-18 14:25:53 -05:00 committed by GitHub
parent 5e71866630
commit f2cc74b7f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 79 additions and 48 deletions

View File

@ -92,9 +92,9 @@ JobPost
│ ├── city (str) │ ├── city (str)
│ ├── state (str) │ ├── state (str)
├── description (str) ├── description (str)
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (str): fulltime, parttime, internship, contract
├── compensation (object) ├── compensation (object)
│ ├── interval (enum): yearly, monthly, weekly, daily, hourly │ ├── interval (str): yearly, monthly, weekly, daily, hourly
│ ├── min_amount (int) │ ├── min_amount (int)
│ ├── max_amount (int) │ ├── max_amount (int)
│ └── currency (enum) │ └── currency (enum)

View File

@ -1,9 +1,9 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.13" version = "1.1.14"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/cullenwatson/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md" readme = "README.md"
packages = [ packages = [
@ -16,6 +16,7 @@ requests = "^2.31.0"
tls-client = "^0.2.1" tls-client = "^0.2.1"
beautifulsoup4 = "^4.12.2" beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0" pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0" pydantic = "^2.3.0"

View File

@ -84,13 +84,12 @@ def scrape_jobs(
except (LinkedInException, IndeedException, ZipRecruiterException) as lie: except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie raise lie
except Exception as e: except Exception as e:
# unhandled exceptions
if site == Site.LINKEDIN: if site == Site.LINKEDIN:
raise LinkedInException() raise LinkedInException(str(e))
if site == Site.INDEED: if site == Site.INDEED:
raise IndeedException() raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER: if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException() raise ZipRecruiterException(str(e))
else: else:
raise e raise e
return site.value, scraped_data return site.value, scraped_data

View File

@ -37,10 +37,16 @@ class JobType(Enum):
"повназайнятість", "повназайнятість",
"toànthờigian", "toànthờigian",
) )
PART_TIME = ("parttime", "teilzeit") PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
CONTRACT = ("contract", "contractor") CONTRACT = ("contract", "contractor")
TEMPORARY = ("temporary",) TEMPORARY = ("temporary",)
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum") INTERNSHIP = (
"internship",
"prácticas",
"ojt(onthejobtraining)",
"praktikum",
"praktik",
)
PER_DIEM = ("perdiem",) PER_DIEM = ("perdiem",)
NIGHTS = ("nights",) NIGHTS = ("nights",)

View File

@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.
class LinkedInException(Exception): class LinkedInException(Exception):
"""Failed to scrape LinkedIn""" def __init__(self, message=None):
super().__init__(message or "An error occurred with LinkedIn")
class IndeedException(Exception): class IndeedException(Exception):
"""Failed to scrape Indeed""" def __init__(self, message=None):
super().__init__(message or "An error occurred with Indeed")
class ZipRecruiterException(Exception): class ZipRecruiterException(Exception):
"""Failed to scrape ZipRecruiter""" def __init__(self, message=None):
super().__init__(message or "An error occurred with ZipRecruiter")

View File

@ -16,7 +16,12 @@ from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session from ..utils import (
count_urgent_words,
extract_emails_from_text,
create_session,
get_enum_from_job_type,
)
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -162,10 +167,10 @@ class IndeedScraper(Scraper):
) )
return job_post return job_post
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor: with ThreadPoolExecutor(max_workers=1) as executor:
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(process_job, job) executor.submit(process_job, job) for job in jobs
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
@ -230,12 +235,36 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][ soup = BeautifulSoup(response.text, "html.parser")
"jobInfoModel" script_tag = soup.find(
]["sanitizedJobDescription"] "script", text=lambda x: x and "window._initialData" in x
with io.StringIO(raw_description) as f: )
soup = BeautifulSoup(f, "html.parser")
text_content = " ".join(soup.get_text().split()).strip() if not script_tag:
return None
script_code = script_tag.string
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
if not match:
return None
json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(
job_description, "html.parser"
)
text_content = " ".join(
soup.get_text(separator=" ").split()
).strip()
return text_content return text_content
@staticmethod @staticmethod
@ -252,22 +281,11 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][i].get("label") label = taxonomy["attributes"][i].get("label")
if label: if label:
job_type_str = label.replace("-", "").replace(" ", "").lower() job_type_str = label.replace("-", "").replace(" ", "").lower()
job_types.append( job_type = get_enum_from_job_type(job_type_str)
IndeedScraper.get_enum_from_job_type(job_type_str) if job_type:
) job_types.append(job_type)
return job_types return job_types
@staticmethod
def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType:
"""
for job_type in JobType:
if job_type_str in job_type.value:
return job_type
return None
@staticmethod @staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict: def parse_jobs(soup: BeautifulSoup) -> dict:
""" """

View File

@ -9,7 +9,6 @@ from datetime import datetime
import requests import requests
import time import time
import re
from requests.exceptions import ProxyError from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -17,7 +16,7 @@ from bs4.element import Tag
from threading import Lock from threading import Lock
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -237,17 +236,10 @@ class LinkedInScraper(Scraper):
employment_type = employment_type.lower() employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "") employment_type = employment_type.replace("-", "")
return LinkedInScraper.get_enum_from_value(employment_type) return [get_enum_from_job_type(employment_type)]
return description, get_job_type(soup) return description, get_job_type(soup)
@staticmethod
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return [job_type]
return None
def get_location(self, metadata_card: Optional[Tag]) -> Location: def get_location(self, metadata_card: Optional[Tag]) -> Location:
""" """
Extracts the location data from the job metadata card. Extracts the location data from the job metadata card.

View File

@ -1,5 +1,6 @@
import re import re
import tls_client import tls_client
from ..jobs import JobType
def count_urgent_words(description: str) -> int: def count_urgent_words(description: str) -> int:
@ -42,3 +43,14 @@ def create_session(proxy: str | None = None):
# } # }
return session return session
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
    """
    Look up the JobType enum member whose value contains the given string.

    :param job_type_str: job type label to search for in each member's value
    :return: the matching JobType member, or None when no member matches;
        if several members match, the last one in definition order is returned
    """
    # Gather every hit rather than stopping at the first, so the final
    # match is the one handed back to the caller.
    matches = [member for member in JobType if job_type_str in member.value]
    return matches[-1] if matches else None