mirror of https://github.com/Bunsly/JobSpy
feat(jobs): urgent kws
parent ecdbf69f94
commit a4d08ffb77
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.11"
+version = "1.1.12"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
 homepage = "https://github.com/cullenwatson/JobSpy"
scrape_jobs module
@@ -98,8 +98,8 @@ def scrape_jobs(
     site_to_jobs_dict = {}

     def worker(site):
-        site_value, scraped_data = scrape_site(site)
-        return site_value, scraped_data
+        site_val, scraped_info = scrape_site(site)
+        return site_val, scraped_info

     with ThreadPoolExecutor() as executor:
         future_to_site = {
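For context, the renamed tuple is presumably consumed by a collection loop that sits outside this hunk. A minimal sketch, assuming the loop shape and that `sites`, `worker`, and `site_to_jobs_dict` come from the surrounding scrape_jobs() body shown above:

from concurrent.futures import ThreadPoolExecutor, as_completed

# Sketch only -- the collection loop is not part of this diff.
with ThreadPoolExecutor() as executor:
    future_to_site = {executor.submit(worker, site): site for site in sites}
    for future in as_completed(future_to_site):
        site_val, scraped_info = future.result()   # names introduced by this commit
        site_to_jobs_dict[site_val] = scraped_info  # results keyed by site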
jobs model (JobPost)
@@ -182,12 +182,12 @@ class JobPost(BaseModel):
     job_url: str
     location: Optional[Location]

-    description: Optional[str] = None
-    job_type: Optional[list[JobType]] = None
-    compensation: Optional[Compensation] = None
-    date_posted: Optional[date] = None
-    benefits: Optional[str] = None
-    emails: Optional[list[str]] = None
+    description: str | None = None
+    job_type: list[JobType] | None = None
+    compensation: Compensation | None = None
+    date_posted: date | None = None
+    benefits: str | None = None
+    emails: list[str] | None = None
     num_urgent_words: int | None = None
     # is_remote: bool | None = None
     # company_industry: str | None = None
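The model migrates these fields from `Optional[X]` to the equivalent PEP 604 form `X | None` (Python 3.10+); pydantic treats both annotations identically. An illustrative construction of the updated model follows; `title` and `company_name` are assumed required fields not visible in this hunk:

# Illustrative only -- field names beyond those shown in the hunk are assumptions.
job = JobPost(
    title="Backend Engineer",
    company_name="Acme Corp",
    job_url="https://example.com/jobs/123",
    location=None,
    description="Urgently hiring! Immediate start.",
    num_urgent_words=2,  # populated from count_urgent_words(description)
)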
Indeed scraper
@@ -18,6 +18,7 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

 from ..exceptions import IndeedException
+from ..utils import count_urgent_words, extract_emails_from_text
 from ...jobs import (
     JobPost,
     Compensation,
@@ -28,12 +29,6 @@ from ...jobs import (
 )
 from .. import Scraper, ScraperInput, Site

-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
-

 class IndeedScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
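Each scraper previously carried its own copy of `extract_emails_from_text`; this commit centralizes it, together with the new `count_urgent_words`, in the shared `..utils` module it now imports. That module itself is not shown in this diff, so here is a minimal sketch of its presumed contents; the urgent-keyword list is an assumption, while the email helper body is taken from the removed per-scraper copies:

import re
from typing import Optional

# Assumed keyword list -- the real one may differ.
URGENT_WORDS = ["urgent", "urgently hiring", "hiring immediately", "immediate start"]

def count_urgent_words(description: str) -> int:
    """Count occurrences of urgency phrases in a job description (case-insensitive)."""
    if not description:
        return 0
    lowered = description.lower()
    return sum(lowered.count(word) for word in URGENT_WORDS)

def extract_emails_from_text(text: str) -> Optional[list[str]]:
    """Relocated here from the per-scraper copies removed in this diff."""
    if not text:
        return None
    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    return email_regex.findall(text)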
@@ -144,7 +139,6 @@ class IndeedScraper(Scraper):
                 date_posted = date_posted.strftime("%Y-%m-%d")

                 description = self.get_description(job_url, session)
-                emails = extract_emails_from_text(description)
                 with io.StringIO(job["snippet"]) as f:
                     soup_io = BeautifulSoup(f, "html.parser")
                     li_elements = soup_io.find_all("li")
@@ -160,11 +154,12 @@ class IndeedScraper(Scraper):
                         state=job.get("jobLocationState"),
                         country=self.country,
                     ),
-                    emails=extract_emails_from_text(description),
                     job_type=job_type,
                     compensation=compensation,
                     date_posted=date_posted,
                     job_url=job_url_client,
+                    emails=extract_emails_from_text(description),
+                    num_urgent_words=count_urgent_words(description)
                 )
                 return job_post

LinkedIn scraper
@@ -17,6 +17,7 @@ from bs4.element import Tag
 from threading import Lock

 from .. import Scraper, ScraperInput, Site
+from ..utils import count_urgent_words, extract_emails_from_text
 from ..exceptions import LinkedInException
 from ...jobs import (
     JobPost,
@@ -26,13 +27,6 @@ from ...jobs import (
 )


-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
-
-
 class LinkedInScraper(Scraper):
     MAX_RETRIES = 3
     DELAY = 10
@@ -180,7 +174,8 @@ class LinkedInScraper(Scraper):
             job_url=job_url,
             job_type=job_type,
             benefits=benefits,
-            emails=extract_emails_from_text(description)
+            emails=extract_emails_from_text(description),
+            num_urgent_words=count_urgent_words(description)
         )

     def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[
@@ -207,7 +202,7 @@ class LinkedInScraper(Scraper):

     def get_job_type(
         soup_job_type: BeautifulSoup,
-    ) -> JobType | None:
+    ) -> list[JobType] | None:
         """
         Gets the job type from job page
         :param soup_job_type:
@@ -238,7 +233,7 @@ class LinkedInScraper(Scraper):
         def get_enum_from_value(value_str):
             for job_type in JobType:
                 if value_str in job_type.value:
-                    return job_type
+                    return [job_type]
             return None

     def get_location(self, metadata_card: Optional[Tag]) -> Location:
@@ -263,9 +258,3 @@ class LinkedInScraper(Scraper):
         )

         return location
-
-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
ZipRecruiter scraper
@@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor, Future

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
+from ..utils import count_urgent_words, extract_emails_from_text
 from ...jobs import (
     JobPost,
     Compensation,
@@ -29,12 +30,6 @@ from ...jobs import (
     Country,
 )

-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
-

 class ZipRecruiterScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
@@ -181,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
             date_posted=date_posted,
             job_url=job_url,
             emails=extract_emails_from_text(description),
+            num_urgent_words=count_urgent_words(description)
         )
         return job_post

@@ -291,11 +287,10 @@ class ZipRecruiterScraper(Scraper):
         return job_post

     @staticmethod
-    def get_job_type_enum(job_type_str: str) -> Optional[JobType]:
+    def get_job_type_enum(job_type_str: str) -> Optional[list[JobType]]:
         for job_type in JobType:
             if job_type_str in job_type.value:
-                a = True
-                return job_type
+                return [job_type]
         return None

     def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
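Because `get_job_type_enum` now returns a single-element list rather than a bare enum member, callers test membership instead of equality. An illustrative call, assuming the lookup string appears in the enum's value tuple:

# Illustrative usage of the new list-typed return (lookup string assumed):
job_types = ZipRecruiterScraper.get_job_type_enum("fulltime")
if job_types is not None and JobType.FULL_TIME in job_types:
    print("matched full-time")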