mirror of https://github.com/Bunsly/JobSpy
feat(jobs): count urgent keywords in job descriptions
parent
ecdbf69f94
commit
a4d08ffb77
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.11"
|
||||
version = "1.1.12"
|
||||
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
|
||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||
homepage = "https://github.com/cullenwatson/JobSpy"
|
||||
|
|
|
@ -98,8 +98,8 @@ def scrape_jobs(
|
|||
site_to_jobs_dict = {}
|
||||
|
||||
def worker(site):
    """
    Scrapes a single site and returns its result pair
    :param site: the site to scrape; handed unchanged to scrape_site
    :return: the (site value, scraped data) pair unpacked from scrape_site
    """
    site_val, scraped_info = scrape_site(site)
    return site_val, scraped_info
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
future_to_site = {
|
||||
|
|
|
@ -182,12 +182,12 @@ class JobPost(BaseModel):
|
|||
job_url: str
|
||||
location: Optional[Location]
|
||||
|
||||
description: Optional[str] = None
|
||||
job_type: Optional[list[JobType]] = None
|
||||
compensation: Optional[Compensation] = None
|
||||
date_posted: Optional[date] = None
|
||||
benefits: Optional[str] = None
|
||||
emails: Optional[list[str]] = None
|
||||
description: str | None = None
|
||||
job_type: list[JobType] | None = None
|
||||
compensation: Compensation | None = None
|
||||
date_posted: date | None = None
|
||||
benefits: str | None = None
|
||||
emails: list[str] | None = None
|
||||
num_urgent_words: int | None = None
|
||||
# is_remote: bool | None = None
|
||||
# company_industry: str | None = None
|
||||
|
|
|
@ -18,6 +18,7 @@ from bs4.element import Tag
|
|||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
|
||||
from ..exceptions import IndeedException
|
||||
from ..utils import count_urgent_words, extract_emails_from_text
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
|
@ -28,12 +29,6 @@ from ...jobs import (
|
|||
)
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
|
||||
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """
    Collects every email-like substring found in the text
    :param text: free text to search; may be empty or None
    :return: None when text is falsy, otherwise the matches in order of
        appearance (an empty list when there are none)
    """
    if not text:
        return None
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
|
||||
|
||||
|
||||
class IndeedScraper(Scraper):
|
||||
def __init__(self, proxy: Optional[str] = None):
|
||||
|
@ -144,7 +139,6 @@ class IndeedScraper(Scraper):
|
|||
date_posted = date_posted.strftime("%Y-%m-%d")
|
||||
|
||||
description = self.get_description(job_url, session)
|
||||
emails = extract_emails_from_text(description)
|
||||
with io.StringIO(job["snippet"]) as f:
|
||||
soup_io = BeautifulSoup(f, "html.parser")
|
||||
li_elements = soup_io.find_all("li")
|
||||
|
@ -160,11 +154,12 @@ class IndeedScraper(Scraper):
|
|||
state=job.get("jobLocationState"),
|
||||
country=self.country,
|
||||
),
|
||||
emails=extract_emails_from_text(description),
|
||||
job_type=job_type,
|
||||
compensation=compensation,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url_client,
|
||||
emails=extract_emails_from_text(description),
|
||||
num_urgent_words=count_urgent_words(description)
|
||||
)
|
||||
return job_post
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ from bs4.element import Tag
|
|||
from threading import Lock
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import count_urgent_words, extract_emails_from_text
|
||||
from ..exceptions import LinkedInException
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
|
@ -26,13 +27,6 @@ from ...jobs import (
|
|||
)
|
||||
|
||||
|
||||
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """
    Collects every email-like substring found in the text
    :param text: free text to search; may be empty or None
    :return: None when text is falsy, otherwise the matches in order of
        appearance (an empty list when there are none)
    """
    if not text:
        return None
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
|
||||
|
||||
|
||||
class LinkedInScraper(Scraper):
|
||||
MAX_RETRIES = 3
|
||||
DELAY = 10
|
||||
|
@ -180,7 +174,8 @@ class LinkedInScraper(Scraper):
|
|||
job_url=job_url,
|
||||
job_type=job_type,
|
||||
benefits=benefits,
|
||||
emails=extract_emails_from_text(description)
|
||||
emails=extract_emails_from_text(description),
|
||||
num_urgent_words=count_urgent_words(description)
|
||||
)
|
||||
|
||||
def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[
|
||||
|
@ -207,7 +202,7 @@ class LinkedInScraper(Scraper):
|
|||
|
||||
def get_job_type(
|
||||
soup_job_type: BeautifulSoup,
|
||||
) -> JobType | None:
|
||||
) -> list[JobType] | None:
|
||||
"""
|
||||
Gets the job type from job page
|
||||
:param soup_job_type:
|
||||
|
@ -238,7 +233,7 @@ class LinkedInScraper(Scraper):
|
|||
def get_enum_from_value(value_str):
    """
    Maps a job type string onto the JobType enum
    :param value_str: text to look for inside each JobType member's value
    :return: single-element list holding the first matching JobType,
        or None when no member matches
    """
    for job_type in JobType:
        if value_str in job_type.value:
            # Build an actual list: ``list[job_type]`` would subscript the
            # ``list`` type and return a generic alias instead of a list.
            return [job_type]
    return None
|
||||
|
||||
def get_location(self, metadata_card: Optional[Tag]) -> Location:
|
||||
|
@ -263,9 +258,3 @@ class LinkedInScraper(Scraper):
|
|||
)
|
||||
|
||||
return location
|
||||
|
||||
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """
    Collects every email-like substring found in the text
    :param text: free text to search; may be empty or None
    :return: None when text is falsy, otherwise the matches in order of
        appearance (an empty list when there are none)
    """
    if not text:
        return None
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
|
|
@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor, Future
|
|||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import ZipRecruiterException
|
||||
from ..utils import count_urgent_words, extract_emails_from_text
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
|
@ -29,12 +30,6 @@ from ...jobs import (
|
|||
Country,
|
||||
)
|
||||
|
||||
def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """
    Collects every email-like substring found in the text
    :param text: free text to search; may be empty or None
    :return: None when text is falsy, otherwise the matches in order of
        appearance (an empty list when there are none)
    """
    if not text:
        return None
    return re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
|
||||
|
||||
|
||||
class ZipRecruiterScraper(Scraper):
|
||||
def __init__(self, proxy: Optional[str] = None):
|
||||
|
@ -181,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
|
|||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
emails=extract_emails_from_text(description),
|
||||
num_urgent_words=count_urgent_words(description)
|
||||
)
|
||||
return job_post
|
||||
|
||||
|
@ -291,11 +287,10 @@ class ZipRecruiterScraper(Scraper):
|
|||
return job_post
|
||||
|
||||
@staticmethod
def get_job_type_enum(job_type_str: str) -> Optional[list[JobType]]:
    """
    Maps a job type string onto the JobType enum
    :param job_type_str: text to look for inside each JobType member's value
    :return: single-element list holding the first matching JobType,
        or None when no member matches
    """
    for job_type in JobType:
        if job_type_str in job_type.value:
            return [job_type]
    return None
|
||||
|
||||
def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
|
Loading…
Reference in New Issue