feat(jobs): add num_urgent_words field counting urgent keywords in job descriptions

pull/56/head
Cullen Watson 2023-10-10 10:10:16 -05:00
parent ecdbf69f94
commit a4d08ffb77
6 changed files with 21 additions and 42 deletions

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.11"
version = "1.1.12"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy"

View File

@ -98,8 +98,8 @@ def scrape_jobs(
site_to_jobs_dict = {}
def worker(site):
    """Scrape a single job site and return its (site_value, scraped_data) pair.

    Used as the ThreadPoolExecutor task so each site's results can be
    collected into site_to_jobs_dict keyed by site value.
    """
    site_val, scraped_info = scrape_site(site)
    return site_val, scraped_info
with ThreadPoolExecutor() as executor:
future_to_site = {

View File

@ -182,12 +182,12 @@ class JobPost(BaseModel):
job_url: str
location: Optional[Location]
description: Optional[str] = None
job_type: Optional[list[JobType]] = None
compensation: Optional[Compensation] = None
date_posted: Optional[date] = None
benefits: Optional[str] = None
emails: Optional[list[str]] = None
description: str | None = None
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
benefits: str | None = None
emails: list[str] | None = None
num_urgent_words: int | None = None
# is_remote: bool | None = None
# company_industry: str | None = None

View File

@ -18,6 +18,7 @@ from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text
from ...jobs import (
JobPost,
Compensation,
@ -28,12 +29,6 @@ from ...jobs import (
)
from .. import Scraper, ScraperInput, Site
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """Find every e-mail address in *text*; None when *text* is falsy."""
    if not text:
        return None
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
class IndeedScraper(Scraper):
def __init__(self, proxy: Optional[str] = None):
@ -144,7 +139,6 @@ class IndeedScraper(Scraper):
date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url, session)
emails = extract_emails_from_text(description)
with io.StringIO(job["snippet"]) as f:
soup_io = BeautifulSoup(f, "html.parser")
li_elements = soup_io.find_all("li")
@ -160,11 +154,12 @@ class IndeedScraper(Scraper):
state=job.get("jobLocationState"),
country=self.country,
),
emails=extract_emails_from_text(description),
job_type=job_type,
compensation=compensation,
date_posted=date_posted,
job_url=job_url_client,
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description)
)
return job_post

View File

@ -17,6 +17,7 @@ from bs4.element import Tag
from threading import Lock
from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text
from ..exceptions import LinkedInException
from ...jobs import (
JobPost,
@ -26,13 +27,6 @@ from ...jobs import (
)
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """Pull all e-mail addresses out of *text*; empty or None input yields None."""
    matcher = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    return matcher.findall(text) if text else None
class LinkedInScraper(Scraper):
MAX_RETRIES = 3
DELAY = 10
@ -180,7 +174,8 @@ class LinkedInScraper(Scraper):
job_url=job_url,
job_type=job_type,
benefits=benefits,
emails=extract_emails_from_text(description)
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description)
)
def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[
@ -207,7 +202,7 @@ class LinkedInScraper(Scraper):
def get_job_type(
soup_job_type: BeautifulSoup,
) -> JobType | None:
) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
@ -238,7 +233,7 @@ class LinkedInScraper(Scraper):
def get_enum_from_value(value_str):
    """Map a raw job-type string onto its matching JobType member, wrapped in a list.

    Returns None when no JobType member's value contains *value_str*.
    """
    for job_type in JobType:
        if value_str in job_type.value:
            # Must be [job_type] — a one-element list. `list[job_type]`
            # subscripts the builtin list type and returns a generic alias,
            # not a list containing the enum member.
            return [job_type]
    return None
def get_location(self, metadata_card: Optional[Tag]) -> Location:
@ -263,9 +258,3 @@ class LinkedInScraper(Scraper):
)
return location
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """Extract every e-mail address contained in *text*.

    Returns None for falsy input, otherwise the (possibly empty) list of matches.
    """
    if not text:
        return None
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    return re.findall(email_pattern, text)

View File

@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text
from ...jobs import (
JobPost,
Compensation,
@ -29,12 +30,6 @@ from ...jobs import (
Country,
)
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """Scan *text* for e-mail addresses; None if *text* is empty or None."""
    if text:
        return re.compile(
            r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        ).findall(text)
    return None
class ZipRecruiterScraper(Scraper):
def __init__(self, proxy: Optional[str] = None):
@ -181,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
date_posted=date_posted,
job_url=job_url,
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description)
)
return job_post
@ -291,11 +287,10 @@ class ZipRecruiterScraper(Scraper):
return job_post
@staticmethod
def get_job_type_enum(job_type_str: str) -> Optional[list[JobType]]:
    """Resolve a raw job-type string to the matching JobType member, as a list.

    Returns None when *job_type_str* is not contained in any JobType value.
    """
    for job_type in JobType:
        if job_type_str in job_type.value:
            # Dead debug assignment (`a = True`) removed; wrap the member in a
            # proper one-element list to match the declared return type.
            return [job_type]
    return None
def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]: