diff --git a/pyproject.toml b/pyproject.toml
index 618a68f..2817eac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.11"
+version = "1.1.12"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/cullenwatson/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 2970ec8..60eee05 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -98,8 +98,8 @@ def scrape_jobs(
     site_to_jobs_dict = {}
 
     def worker(site):
-        site_value, scraped_data = scrape_site(site)
-        return site_value, scraped_data
+        site_val, scraped_info = scrape_site(site)
+        return site_val, scraped_info
 
     with ThreadPoolExecutor() as executor:
         future_to_site = {
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 98467aa..93bdd69 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -182,12 +182,12 @@ class JobPost(BaseModel):
     job_url: str
     location: Optional[Location]
 
-    description: Optional[str] = None
-    job_type: Optional[list[JobType]] = None
-    compensation: Optional[Compensation] = None
-    date_posted: Optional[date] = None
-    benefits: Optional[str] = None
-    emails: Optional[list[str]] = None
+    description: str | None = None
+    job_type: list[JobType] | None = None
+    compensation: Compensation | None = None
+    date_posted: date | None = None
+    benefits: str | None = None
+    emails: list[str] | None = None
     num_urgent_words: int | None = None
     # is_remote: bool | None = None
     # company_industry: str | None = None
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 611b0c3..65d1144 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -18,6 +18,7 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
 
 from ..exceptions import IndeedException
+from ..utils import count_urgent_words, extract_emails_from_text
 from ...jobs import (
     JobPost,
     Compensation,
@@ -28,12 +29,6 @@ from ...jobs import (
 )
 from .. import Scraper, ScraperInput, Site
 
 
-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
-
 class IndeedScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
@@ -144,7 +139,6 @@ class IndeedScraper(Scraper):
                 date_posted = date_posted.strftime("%Y-%m-%d")
 
             description = self.get_description(job_url, session)
-            emails = extract_emails_from_text(description)
             with io.StringIO(job["snippet"]) as f:
                 soup_io = BeautifulSoup(f, "html.parser")
                 li_elements = soup_io.find_all("li")
@@ -160,11 +154,12 @@ class IndeedScraper(Scraper):
                     state=job.get("jobLocationState"),
                     country=self.country,
                 ),
-                emails=extract_emails_from_text(description),
                 job_type=job_type,
                 compensation=compensation,
                 date_posted=date_posted,
                 job_url=job_url_client,
+                emails=extract_emails_from_text(description),
+                num_urgent_words=count_urgent_words(description)
             )
 
             return job_post
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 8331d36..2e7df9a 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -17,6 +17,7 @@ from bs4.element import Tag
 from threading import Lock
 
 from .. import Scraper, ScraperInput, Site
+from ..utils import count_urgent_words, extract_emails_from_text
 from ..exceptions import LinkedInException
 from ...jobs import (
     JobPost,
@@ -26,13 +27,6 @@ from ...jobs import (
 )
 
 
-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
-
-
 class LinkedInScraper(Scraper):
     MAX_RETRIES = 3
     DELAY = 10
@@ -180,7 +174,8 @@ class LinkedInScraper(Scraper):
             job_url=job_url,
             job_type=job_type,
             benefits=benefits,
-            emails=extract_emails_from_text(description)
+            emails=extract_emails_from_text(description),
+            num_urgent_words=count_urgent_words(description)
         )
 
     def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[
@@ -207,7 +202,7 @@
     def get_job_type(
         soup_job_type: BeautifulSoup,
-    ) -> JobType | None:
+    ) -> list[JobType] | None:
         """
         Gets the job type from job page
         :param soup_job_type:
@@ -238,7 +233,7 @@
         def get_enum_from_value(value_str):
             for job_type in JobType:
                 if value_str in job_type.value:
-                    return job_type
+                    return [job_type]
             return None
 
     def get_location(self, metadata_card: Optional[Tag]) -> Location:
@@ -263,9 +258,3 @@
         )
 
         return location
-
-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
\ No newline at end of file
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index c1a0fee..0373e66 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor, Future
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
+from ..utils import count_urgent_words, extract_emails_from_text
 from ...jobs import (
     JobPost,
     Compensation,
@@ -29,12 +30,6 @@ from ...jobs import (
     Country,
 )
 
 
-def extract_emails_from_text(text: str) -> Optional[list[str]]:
-    if not text:
-        return None
-    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
-    return email_regex.findall(text)
-
 class ZipRecruiterScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
@@ -181,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
             date_posted=date_posted,
             job_url=job_url,
             emails=extract_emails_from_text(description),
+            num_urgent_words=count_urgent_words(description)
         )
 
         return job_post
@@ -291,11 +287,10 @@
         return job_post
 
     @staticmethod
-    def get_job_type_enum(job_type_str: str) -> Optional[JobType]:
+    def get_job_type_enum(job_type_str: str) -> Optional[list[JobType]]:
         for job_type in JobType:
             if job_type_str in job_type.value:
-                a = True
-                return job_type
+                return [job_type]
         return None
 
     def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
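
Note: the shared ..utils module that the three scrapers now import from, and that lets this patch delete the duplicated per-scraper extract_emails_from_text definitions, is not included in this excerpt. The sketch below shows what src/jobspy/scrapers/utils.py could plausibly contain: extract_emails_from_text is reproduced from the definitions removed above, while the body of count_urgent_words (including its keyword list) is an assumption based only on its call sites and the num_urgent_words field on JobPost.

# src/jobspy/scrapers/utils.py -- sketch only; count_urgent_words and its keyword
# list are assumptions, extract_emails_from_text mirrors the removed local copies.
import re
from typing import Optional


def count_urgent_words(description: str) -> Optional[int]:
    # Count occurrences of "urgency" keywords in a job description.
    # The exact keyword list used by the project is not shown in this diff.
    if not description:
        return None
    urgent_words = ["urgent", "urgently", "immediate start", "hiring now", "asap"]
    lowered = description.lower()
    return sum(lowered.count(word) for word in urgent_words)


def extract_emails_from_text(text: str) -> Optional[list[str]]:
    # Same helper each scraper previously defined locally.
    if not text:
        return None
    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    return email_regex.findall(text)

Centralizing these helpers means LinkedIn, Indeed, and ZipRecruiter all call one implementation instead of keeping three drifting copies.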