From e5353e604d13c8b48e4d671c187908c72b298a93 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Tue, 10 Oct 2023 11:23:04 -0500 Subject: [PATCH] Multiple job types for Indeed, urgent keywords column (#56) * enh(indeed): mult job types * feat(jobs): urgent kws * fix(indeed): use new session obj per request * fix: emails as comma separated in output * fix: put num urgent words in output * chore: readme --- README.md | 31 +-- examples/JobSpy_Demo.py | 14 +- pyproject.toml | 2 +- src/jobspy/__init__.py | 55 ++--- src/jobspy/jobs/__init__.py | 15 +- src/jobspy/scrapers/indeed/__init__.py | 105 +++++---- src/jobspy/scrapers/linkedin/__init__.py | 49 ++-- src/jobspy/scrapers/ziprecruiter/__init__.py | 231 ++++++++++--------- src/tests/test_all.py | 4 +- src/tests/test_indeed.py | 4 +- src/tests/test_linkedin.py | 4 +- src/tests/test_ziprecruiter.py | 4 +- 12 files changed, 271 insertions(+), 247 deletions(-) diff --git a/README.md b/README.md index 2f99193..269f436 100644 --- a/README.md +++ b/README.md @@ -33,37 +33,19 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) ```python from jobspy import scrape_jobs -import pandas as pd -jobs: pd.DataFrame = scrape_jobs( +jobs = scrape_jobs( site_name=["indeed", "linkedin", "zip_recruiter"], search_term="software engineer", location="Dallas, TX", results_wanted=10, - country_indeed='USA' # only needed for indeed - - # use if you want to use a proxy - # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", - # offset=25 # use if you want to start at a specific offset ) +print(f"Found {len(jobs)} jobs") +print(jobs.head()) +jobs.to_csv("jobs.csv", index=False) -# formatting for pandas -pd.set_option('display.max_columns', None) -pd.set_option('display.max_rows', None) -pd.set_option('display.width', None) -pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc - -# 1 output to console -print(jobs) - -# 2 display in Jupyter Notebook (1. pip install jupyter 2. 
jupyter notebook)
-# display(jobs)
-
-# 3 output to .csv
-# jobs.to_csv('jobs.csv', index=False)
-
-# 4 output to .xlsx
+# output to .xlsx
 # jobs.to_xlsx('jobs.xlsx', index=False)
 ```

@@ -117,6 +99,9 @@ JobPost
 │ ├── max_amount (int)
 │ └── currency (enum)
 └── date_posted (date)
+└── emails (str)
+└── num_urgent_words (int)
+└── is_remote (bool) - just for Indeed at the moment
 ```

 ### Exceptions
diff --git a/examples/JobSpy_Demo.py b/examples/JobSpy_Demo.py
index 598dcd0..c982793 100644
--- a/examples/JobSpy_Demo.py
+++ b/examples/JobSpy_Demo.py
@@ -6,23 +6,23 @@ jobs: pd.DataFrame = scrape_jobs(
     search_term="software engineer",
     location="Dallas, TX",
     results_wanted=50, # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho)
-    country_indeed='USA',
+    country_indeed="USA",
     offset=25 # start jobs from an offset (use if search failed and want to continue)
     # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
 )

 # formatting for pandas
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_rows', None)
-pd.set_option('display.width', None)
-pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
+pd.set_option("display.max_columns", None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.width", None)
+pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc

 # 1: output to console
 print(jobs)

 # 2: output to .csv
-jobs.to_csv('./jobs.csv', index=False)
-print('outputted to jobs.csv')
+jobs.to_csv("./jobs.csv", index=False)
+print("outputted to jobs.csv")

 # 3: output to .xlsx
 # jobs.to_xlsx('jobs.xlsx', index=False)
diff --git a/pyproject.toml b/pyproject.toml
index 2817eac..b277d13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.12"
+version = "1.1.13"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/cullenwatson/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 061d73b..1bbbaf4 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Tuple, Optional
+from typing import Tuple, Optional

 from .jobs import JobType, Location
 from .scrapers.indeed import IndeedScraper
@@ -26,18 +26,18 @@ def _map_str_to_site(site_name: str) -> Site:


 def scrape_jobs(
-    site_name: str | List[str] | Site | List[Site],
-    search_term: str,
-    location: str = "",
-    distance: int = None,
-    is_remote: bool = False,
-    job_type: str = None,
-    easy_apply: bool = False, # linkedin
-    results_wanted: int = 15,
-    country_indeed: str = "usa",
-    hyperlinks: bool = False,
-    proxy: Optional[str] = None,
-    offset: Optional[int] = 0
+    site_name: str | list[str] | Site | list[Site],
+    search_term: str,
+    location: str = "",
+    distance: int = None,
+    is_remote: bool = False,
+    job_type: str = None,
+    easy_apply: bool = False, # linkedin
+    results_wanted: int = 15,
+    country_indeed: str = "usa",
+    hyperlinks: bool = False,
+    proxy: Optional[str] = None,
+    offset: Optional[int] = 0,
 ) -> pd.DataFrame:
     """
     Simultaneously scrapes job data from multiple job sites.
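As a point of reference for the updated signature, a minimal usage sketch follows; the site list, search values, and filtering logic are illustrative placeholders, while the column names (`job_type`, `emails`, `num_urgent_words`, `is_remote`, `title`, `company`) come from the `desired_order` list further down in this file.

```python
from jobspy import scrape_jobs

# Illustrative call against the updated scrape_jobs() signature; values are placeholders.
jobs = scrape_jobs(
    site_name=["indeed", "zip_recruiter"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=20,
    country_indeed="USA",  # only read by the Indeed scraper
    offset=0,              # start at the first result
    # proxy="http://user:pass@host:port",  # optional, placeholder URL
)

# job_type and emails are flattened to comma-separated strings by the
# ", ".join(...) post-processing below, so plain string operations work on them.
multi_type = jobs[jobs["job_type"].str.contains(",", na=False)]
urgent = jobs[jobs["num_urgent_words"].fillna(0) > 0]
print(f"{len(multi_type)} postings list more than one job type")
print(urgent[["title", "company", "emails", "is_remote"]].head())
```

Because `job_type` and `emails` arrive as comma-separated strings rather than lists, downstream code no longer has to handle list-valued DataFrame cells.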
@@ -72,7 +72,7 @@ def scrape_jobs( job_type=job_type, easy_apply=easy_apply, results_wanted=results_wanted, - offset=offset + offset=offset, ) def scrape_site(site: Site) -> Tuple[str, JobResponse]: @@ -98,8 +98,8 @@ def scrape_jobs( site_to_jobs_dict = {} def worker(site): - site_value, scraped_data = scrape_site(site) - return site_value, scraped_data + site_val, scraped_info = scrape_site(site) + return site_val, scraped_info with ThreadPoolExecutor() as executor: future_to_site = { @@ -110,7 +110,7 @@ def scrape_jobs( site_value, scraped_data = future.result() site_to_jobs_dict[site_value] = scraped_data - jobs_dfs: List[pd.DataFrame] = [] + jobs_dfs: list[pd.DataFrame] = [] for site, job_response in site_to_jobs_dict.items(): for job in job_response.jobs: @@ -120,12 +120,14 @@ def scrape_jobs( ] = f'{job_data["job_url"]}' job_data["site"] = site job_data["company"] = job_data["company_name"] - if job_data["job_type"]: - # Take the first value from the job type tuple - job_data["job_type"] = job_data["job_type"].value[0] - else: - job_data["job_type"] = None - + job_data["job_type"] = ( + ", ".join(job_type.value[0] for job_type in job_data["job_type"]) + if job_data["job_type"] + else None + ) + job_data["emails"] = ( + ", ".join(job_data["emails"]) if job_data["emails"] else None + ) job_data["location"] = Location(**job_data["location"]).display_location() compensation_obj = job_data.get("compensation") @@ -149,7 +151,7 @@ def scrape_jobs( if jobs_dfs: jobs_df = pd.concat(jobs_dfs, ignore_index=True) - desired_order: List[str] = [ + desired_order: list[str] = [ "job_url_hyper" if hyperlinks else "job_url", "site", "title", @@ -158,12 +160,13 @@ def scrape_jobs( "job_type", "date_posted", "interval", - "benefits", "min_amount", "max_amount", "currency", + "is_remote", + "num_urgent_words", + "benefits", "emails", - "job_url_hyper" if hyperlinks else "job_url", "description", ] jobs_formatted_df = jobs_df[desired_order] diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index dc948c2..543d6c7 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -182,12 +182,15 @@ class JobPost(BaseModel): job_url: str location: Optional[Location] - description: Optional[str] = None - job_type: Optional[JobType] = None - compensation: Optional[Compensation] = None - date_posted: Optional[date] = None - benefits: Optional[str] = None - emails: Optional[list[str]] = None + description: str | None = None + job_type: list[JobType] | None = None + compensation: Compensation | None = None + date_posted: date | None = None + benefits: str | None = None + emails: list[str] | None = None + num_urgent_words: int | None = None + is_remote: bool | None = None + # company_industry: str | None = None class JobResponse(BaseModel): diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 3a2543b..cccd62c 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -9,15 +9,14 @@ import math import io import json from datetime import datetime -from typing import Optional -import tls_client import urllib.parse from bs4 import BeautifulSoup from bs4.element import Tag from concurrent.futures import ThreadPoolExecutor, Future from ..exceptions import IndeedException +from ..utils import count_urgent_words, extract_emails_from_text, create_session from ...jobs import ( JobPost, Compensation, @@ -28,15 +27,9 @@ from ...jobs import ( ) from .. 
import Scraper, ScraperInput, Site -def extract_emails_from_text(text: str) -> Optional[list[str]]: - if not text: - return None - email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") - return email_regex.findall(text) - class IndeedScraper(Scraper): - def __init__(self, proxy: Optional[str] = None): + def __init__(self, proxy: str | None = None): """ Initializes IndeedScraper with the Indeed job search url """ @@ -49,20 +42,18 @@ class IndeedScraper(Scraper): self.seen_urls = set() def scrape_page( - self, scraper_input: ScraperInput, page: int, session: tls_client.Session + self, scraper_input: ScraperInput, page: int ) -> tuple[list[JobPost], int]: """ Scrapes a page of Indeed for jobs with scraper_input criteria :param scraper_input: :param page: - :param session: :return: jobs found on page, total number of jobs found for search """ self.country = scraper_input.country domain = self.country.domain_value self.url = f"https://{domain}.indeed.com" - - job_list: list[JobPost] = [] + session = create_session(self.proxy) params = { "q": scraper_input.search_term, @@ -84,9 +75,9 @@ class IndeedScraper(Scraper): try: response = session.get( f"{self.url}/jobs", + headers=self.get_headers(), params=params, allow_redirects=True, - proxy=self.proxy, timeout_seconds=10, ) if response.status_code not in range(200, 400): @@ -108,13 +99,13 @@ class IndeedScraper(Scraper): total_num_jobs = IndeedScraper.total_jobs(soup) if ( - not jobs.get("metaData", {}) - .get("mosaicProviderJobCardsModel", {}) - .get("results") + not jobs.get("metaData", {}) + .get("mosaicProviderJobCardsModel", {}) + .get("results") ): raise IndeedException("No jobs found.") - def process_job(job) -> Optional[JobPost]: + def process_job(job) -> JobPost | None: job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}' job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}' if job_url in self.seen_urls: @@ -143,8 +134,7 @@ class IndeedScraper(Scraper): date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = date_posted.strftime("%Y-%m-%d") - description = self.get_description(job_url, session) - emails = extract_emails_from_text(description) + description = self.get_description(job_url) with io.StringIO(job["snippet"]) as f: soup_io = BeautifulSoup(f, "html.parser") li_elements = soup_io.find_all("li") @@ -160,11 +150,15 @@ class IndeedScraper(Scraper): state=job.get("jobLocationState"), country=self.country, ), - emails=extract_emails_from_text(description), job_type=job_type, compensation=compensation, date_posted=date_posted, job_url=job_url_client, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) + if description + else None, + is_remote=self.is_remote_job(job), ) return job_post @@ -184,20 +178,16 @@ class IndeedScraper(Scraper): :param scraper_input: :return: job_response """ - session = tls_client.Session( - client_identifier="chrome112", random_tls_extension_order=True - ) - pages_to_process = ( - math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 + math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 ) #: get first page to initialize session - job_list, total_results = self.scrape_page(scraper_input, 0, session) + job_list, total_results = self.scrape_page(scraper_input, 0) with ThreadPoolExecutor(max_workers=1) as executor: futures: list[Future] = [ - executor.submit(self.scrape_page, scraper_input, page, session) + executor.submit(self.scrape_page, scraper_input, page) for page in 
range(1, pages_to_process + 1) ] @@ -215,21 +205,24 @@ class IndeedScraper(Scraper): ) return job_response - def get_description(self, job_page_url: str, session: tls_client.Session) -> Optional[str]: + def get_description(self, job_page_url: str) -> str | None: """ Retrieves job description by going to the job page url :param job_page_url: - :param session: :return: description """ parsed_url = urllib.parse.urlparse(job_page_url) params = urllib.parse.parse_qs(parsed_url.query) jk_value = params.get("jk", [None])[0] formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" + session = create_session(self.proxy) try: response = session.get( - formatted_url, allow_redirects=True, timeout_seconds=5, proxy=self.proxy + formatted_url, + headers=self.get_headers(), + allow_redirects=True, + timeout_seconds=5, ) except Exception as e: return None @@ -246,20 +239,23 @@ class IndeedScraper(Scraper): return text_content @staticmethod - def get_job_type(job: dict) -> Optional[JobType]: + def get_job_type(job: dict) -> list[JobType] | None: """ - Parses the job to get JobTypeIndeed + Parses the job to get list of job types :param job: :return: """ + job_types: list[JobType] = [] for taxonomy in job["taxonomyAttributes"]: if taxonomy["label"] == "job-types": - if len(taxonomy["attributes"]) > 0: - label = taxonomy["attributes"][0].get("label") + for i in range(len(taxonomy["attributes"])): + label = taxonomy["attributes"][i].get("label") if label: job_type_str = label.replace("-", "").replace(" ", "").lower() - return IndeedScraper.get_enum_from_job_type(job_type_str) - return None + job_types.append( + IndeedScraper.get_enum_from_job_type(job_type_str) + ) + return job_types @staticmethod def get_enum_from_job_type(job_type_str): @@ -280,7 +276,7 @@ class IndeedScraper(Scraper): :return: jobs """ - def find_mosaic_script() -> Optional[Tag]: + def find_mosaic_script() -> Tag | None: """ Finds jobcards script tag :return: script_tag @@ -289,9 +285,9 @@ class IndeedScraper(Scraper): for tag in script_tags: if ( - tag.string - and "mosaic.providerData" in tag.string - and "mosaic-provider-jobcards" in tag.string + tag.string + and "mosaic.providerData" in tag.string + and "mosaic-provider-jobcards" in tag.string ): return tag return None @@ -330,3 +326,30 @@ class IndeedScraper(Scraper): data = json.loads(json_str) total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"]) return total_num_jobs + + @staticmethod + def get_headers(): + return { + "authority": "www.indeed.com", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw", + "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "empty", + 
"sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + } + + @staticmethod + def is_remote_job(job: dict) -> bool: + """ + :param job: + :return: bool + """ + for taxonomy in job.get("taxonomyAttributes", []): + if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0: + return True + return False diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 8331d36..c0681d6 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -17,6 +17,7 @@ from bs4.element import Tag from threading import Lock from .. import Scraper, ScraperInput, Site +from ..utils import count_urgent_words, extract_emails_from_text from ..exceptions import LinkedInException from ...jobs import ( JobPost, @@ -26,13 +27,6 @@ from ...jobs import ( ) -def extract_emails_from_text(text: str) -> Optional[list[str]]: - if not text: - return None - email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") - return email_regex.findall(text) - - class LinkedInScraper(Scraper): MAX_RETRIES = 3 DELAY = 10 @@ -99,13 +93,15 @@ class LinkedInScraper(Scraper): break except requests.HTTPError as e: - if hasattr(e, 'response') and e.response is not None: + if hasattr(e, "response") and e.response is not None: if e.response.status_code == 429: time.sleep(self.DELAY) retries += 1 continue else: - raise LinkedInException(f"bad response status code: {e.response.status_code}") + raise LinkedInException( + f"bad response status code: {e.response.status_code}" + ) else: raise except ProxyError as e: @@ -114,7 +110,9 @@ class LinkedInScraper(Scraper): raise LinkedInException(str(e)) else: # Raise an exception if the maximum number of retries is reached - raise LinkedInException("Max retries reached, failed to get a valid response") + raise LinkedInException( + "Max retries reached, failed to get a valid response" + ) soup = BeautifulSoup(response.text, "html.parser") @@ -141,7 +139,9 @@ class LinkedInScraper(Scraper): if job_post: job_list.append(job_post) except Exception as e: - raise LinkedInException("Exception occurred while processing jobs") + raise LinkedInException( + "Exception occurred while processing jobs" + ) page += 25 job_list = job_list[: scraper_input.results_wanted] @@ -158,7 +158,11 @@ class LinkedInScraper(Scraper): metadata_card = job_card.find("div", class_="base-search-card__metadata") location = self.get_location(metadata_card) - datetime_tag = metadata_card.find("time", class_="job-search-card__listdate") if metadata_card else None + datetime_tag = ( + metadata_card.find("time", class_="job-search-card__listdate") + if metadata_card + else None + ) date_posted = None if datetime_tag and "datetime" in datetime_tag.attrs: datetime_str = datetime_tag["datetime"] @@ -178,13 +182,16 @@ class LinkedInScraper(Scraper): location=location, date_posted=date_posted, job_url=job_url, + # job_type=[JobType.FULL_TIME], job_type=job_type, benefits=benefits, - emails=extract_emails_from_text(description) + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, ) - def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[ - str | None, tuple[str | None, JobType | None]]: + def get_job_description( + self, job_page_url: str + ) -> tuple[None, None] | tuple[str | None, 
tuple[str | None, JobType | None]]: """ Retrieves job description by going to the job page url :param job_page_url: @@ -206,8 +213,8 @@ class LinkedInScraper(Scraper): description = " ".join(div_content.get_text().split()).strip() def get_job_type( - soup_job_type: BeautifulSoup, - ) -> JobType | None: + soup_job_type: BeautifulSoup, + ) -> list[JobType] | None: """ Gets the job type from job page :param soup_job_type: @@ -238,7 +245,7 @@ class LinkedInScraper(Scraper): def get_enum_from_value(value_str): for job_type in JobType: if value_str in job_type.value: - return job_type + return [job_type] return None def get_location(self, metadata_card: Optional[Tag]) -> Location: @@ -263,9 +270,3 @@ class LinkedInScraper(Scraper): ) return location - -def extract_emails_from_text(text: str) -> Optional[list[str]]: - if not text: - return None - email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") - return email_regex.findall(text) \ No newline at end of file diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index c1a0fee..b9b41a6 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -11,7 +11,6 @@ from datetime import datetime, date from typing import Optional, Tuple, Any from urllib.parse import urlparse, parse_qs, urlunparse -import tls_client import requests from bs4 import BeautifulSoup from bs4.element import Tag @@ -19,6 +18,7 @@ from concurrent.futures import ThreadPoolExecutor, Future from .. import Scraper, ScraperInput, Site from ..exceptions import ZipRecruiterException +from ..utils import count_urgent_words, extract_emails_from_text, create_session from ...jobs import ( JobPost, Compensation, @@ -29,12 +29,6 @@ from ...jobs import ( Country, ) -def extract_emails_from_text(text: str) -> Optional[list[str]]: - if not text: - return None - email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") - return email_regex.findall(text) - class ZipRecruiterScraper(Scraper): def __init__(self, proxy: Optional[str] = None): @@ -47,12 +41,9 @@ class ZipRecruiterScraper(Scraper): self.jobs_per_page = 20 self.seen_urls = set() - self.session = tls_client.Session( - client_identifier="chrome112", random_tls_extension_order=True - ) def find_jobs_in_page( - self, scraper_input: ScraperInput, page: int + self, scraper_input: ScraperInput, page: int ) -> list[JobPost]: """ Scrapes a page of ZipRecruiter for jobs with scraper_input criteria @@ -60,14 +51,13 @@ class ZipRecruiterScraper(Scraper): :param page: :return: jobs found on page """ - job_list: list[JobPost] = [] + session = create_session(self.proxy) try: - response = self.session.get( + response = session.get( f"{self.url}/jobs-search", - headers=ZipRecruiterScraper.headers(), - params=ZipRecruiterScraper.add_params(scraper_input, page), + headers=self.headers(), + params=self.add_params(scraper_input, page), allow_redirects=True, - proxy=self.proxy, timeout_seconds=10, ) if response.status_code != 200: @@ -121,7 +111,11 @@ class ZipRecruiterScraper(Scraper): :param scraper_input: :return: job_response """ - start_page = (scraper_input.offset // self.jobs_per_page) + 1 if scraper_input.offset else 1 + start_page = ( + (scraper_input.offset // self.jobs_per_page) + 1 + if scraper_input.offset + else 1 + ) #: get first page to initialize session job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page) pages_to_process = max( @@ -142,91 +136,10 @@ class 
ZipRecruiterScraper(Scraper): job_list = job_list[: scraper_input.results_wanted] return JobResponse(jobs=job_list) - def process_job_html_1(self, job: Tag) -> Optional[JobPost]: - """ - Parses a job from the job content tag - :param job: BeautifulSoup Tag for one job post - :return JobPost - TODO this method isnt finished due to not encountering this type of html often - """ - job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"]) - if job_url in self.seen_urls: - return None - - title = job.find("h2", {"class": "title"}).text - company = job.find("a", {"class": "company_name"}).text.strip() - - description, updated_job_url = self.get_description(job_url) - # job_url = updated_job_url if updated_job_url else job_url - if description is None: - description = job.find("p", {"class": "job_snippet"}).text.strip() - - job_type_element = job.find("li", {"class": "perk_item perk_type"}) - job_type = None - if job_type_element: - job_type_text = ( - job_type_element.text.strip().lower().replace("_", "").replace(" ", "") - ) - job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) - - date_posted = ZipRecruiterScraper.get_date_posted(job) - - job_post = JobPost( - title=title, - description=description, - company_name=company, - location=ZipRecruiterScraper.get_location(job), - job_type=job_type, - compensation=ZipRecruiterScraper.get_compensation(job), - date_posted=date_posted, - job_url=job_url, - emails=extract_emails_from_text(description), - ) - return job_post - - def process_job_html_2(self, job: Tag) -> Optional[JobPost]: - """ - Parses a job from the job content tag for a second variat of HTML that ZR uses - :param job: BeautifulSoup Tag for one job post - :return JobPost - """ - job_url = self.cleanurl(job.find("a", class_="job_link")["href"]) - title = job.find("h2", class_="title").text - company = job.find("a", class_="company_name").text.strip() - - description, updated_job_url = self.get_description(job_url) - # job_url = updated_job_url if updated_job_url else job_url - if description is None: - description = job.find("p", class_="job_snippet").get_text().strip() - - job_type_text = job.find("li", class_="perk_item perk_type") - job_type = None - if job_type_text: - job_type_text = ( - job_type_text.get_text() - .strip() - .lower() - .replace("-", "") - .replace(" ", "") - ) - job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) - date_posted = ZipRecruiterScraper.get_date_posted(job) - - job_post = JobPost( - title=title, - description=description, - company_name=company, - location=ZipRecruiterScraper.get_location(job), - job_type=job_type, - compensation=ZipRecruiterScraper.get_compensation(job), - date_posted=date_posted, - job_url=job_url, - ) - return job_post - def process_job_javascript(self, job: dict) -> JobPost: + """the most common type of jobs page on ZR""" title = job.get("Title") - job_url = self.cleanurl(job.get("JobURL")) + job_url = job.get("JobURL") description, updated_job_url = self.get_description(job_url) # job_url = updated_job_url if updated_job_url else job_url @@ -280,38 +193,126 @@ class ZipRecruiterScraper(Scraper): return JobPost( title=title, - description=description, company_name=company, location=location, job_type=job_type, compensation=compensation, date_posted=date_posted, job_url=job_url, + description=description, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, + ) + + def process_job_html_2(self, job: Tag) 
-> Optional[JobPost]: + """ + second most common type of jobs page on ZR after process_job_javascript() + Parses a job from the job content tag for a second variat of HTML that ZR uses + :param job: BeautifulSoup Tag for one job post + :return JobPost + """ + job_url = job.find("a", class_="job_link")["href"] + title = job.find("h2", class_="title").text + company = job.find("a", class_="company_name").text.strip() + + description, updated_job_url = self.get_description(job_url) + # job_url = updated_job_url if updated_job_url else job_url + if description is None: + description = job.find("p", class_="job_snippet").get_text().strip() + + job_type_text = job.find("li", class_="perk_item perk_type") + job_type = None + if job_type_text: + job_type_text = ( + job_type_text.get_text() + .strip() + .lower() + .replace("-", "") + .replace(" ", "") + ) + job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) + date_posted = ZipRecruiterScraper.get_date_posted(job) + + job_post = JobPost( + title=title, + company_name=company, + location=ZipRecruiterScraper.get_location(job), + job_type=job_type, + compensation=ZipRecruiterScraper.get_compensation(job), + date_posted=date_posted, + job_url=job_url, + description=description, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, + ) + return job_post + + def process_job_html_1(self, job: Tag) -> Optional[JobPost]: + """ + TODO this method isnt finished due to not encountering this type of html often + least common type of jobs page on ZR (rarely found) + Parses a job from the job content tag + :param job: BeautifulSoup Tag for one job post + :return JobPost + """ + job_url = job.find("a", {"class": "job_link"})["href"] + # job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"]) + if job_url in self.seen_urls: + return None + + title = job.find("h2", {"class": "title"}).text + company = job.find("a", {"class": "company_name"}).text.strip() + + description, _ = self.get_description(job_url) + # job_url = updated_job_url if updated_job_url else job_url + # get description from jobs listing page if get_description from the specific job page fails + if description is None: + description = job.find("p", {"class": "job_snippet"}).text.strip() + + job_type_element = job.find("li", {"class": "perk_item perk_type"}) + job_type = None + if job_type_element: + job_type_text = ( + job_type_element.text.strip().lower().replace("_", "").replace(" ", "") + ) + job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text) + + date_posted = ZipRecruiterScraper.get_date_posted(job) + + job_post = JobPost( + title=title, + description=description, + company_name=company, + location=ZipRecruiterScraper.get_location(job), + job_type=job_type, + compensation=ZipRecruiterScraper.get_compensation(job), + date_posted=date_posted, + job_url=job_url, + emails=extract_emails_from_text(description), + num_urgent_words=count_urgent_words(description), ) return job_post @staticmethod - def get_job_type_enum(job_type_str: str) -> Optional[JobType]: + def get_job_type_enum(job_type_str: str) -> list[JobType] | None: for job_type in JobType: if job_type_str in job_type.value: - a = True - return job_type + return [job_type] return None - def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]: + def get_description(self, job_page_url: str) -> Tuple[str | None, str | None]: """ Retrieves job description by going to the job page url 
:param job_page_url: - :param session: :return: description or None, response url """ try: - response = requests.get( + session = create_session(self.proxy) + response = session.get( job_page_url, - headers=ZipRecruiterScraper.headers(), + headers=self.headers(), allow_redirects=True, - timeout=5, - proxies=self.proxy, + timeout_seconds=5, ) if response.status_code not in range(200, 400): return None, None @@ -467,8 +468,8 @@ class ZipRecruiterScraper(Scraper): "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36" } - @staticmethod - def cleanurl(url): - parsed_url = urlparse(url) - - return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', '')) + # @staticmethod + # def cleanurl(url) -> str: + # parsed_url = urlparse(url) + # + # return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', '')) diff --git a/src/tests/test_all.py b/src/tests/test_all.py index 749be79..5ffd333 100644 --- a/src/tests/test_all.py +++ b/src/tests/test_all.py @@ -9,4 +9,6 @@ def test_all(): results_wanted=5, ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame" diff --git a/src/tests/test_indeed.py b/src/tests/test_indeed.py index 8c6412e..280215f 100644 --- a/src/tests/test_indeed.py +++ b/src/tests/test_indeed.py @@ -7,4 +7,6 @@ def test_indeed(): site_name="indeed", search_term="software engineer", ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame" diff --git a/src/tests/test_linkedin.py b/src/tests/test_linkedin.py index 5814134..8db0a62 100644 --- a/src/tests/test_linkedin.py +++ b/src/tests/test_linkedin.py @@ -7,4 +7,6 @@ def test_linkedin(): site_name="linkedin", search_term="software engineer", ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame" diff --git a/src/tests/test_ziprecruiter.py b/src/tests/test_ziprecruiter.py index f2d2212..cd1c8ee 100644 --- a/src/tests/test_ziprecruiter.py +++ b/src/tests/test_ziprecruiter.py @@ -8,4 +8,6 @@ def test_ziprecruiter(): search_term="software engineer", ) - assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" + assert ( + isinstance(result, pd.DataFrame) and not result.empty + ), "Result should be a non-empty DataFrame"
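Note on the shared helpers: the Indeed, LinkedIn, and ZipRecruiter scrapers above now import `count_urgent_words`, `extract_emails_from_text`, and `create_session` from `..utils`, but that module's contents are not part of this diff. Below is a minimal sketch of what it plausibly contains, assuming `tls_client` remains the HTTP client: `extract_emails_from_text` mirrors the duplicate helpers deleted from each scraper, the `tls_client.Session` arguments mirror the per-scraper sessions removed above, and the urgent-keyword list plus the proxy wiring are assumptions rather than anything this patch specifies.

```python
# Hypothetical sketch of src/jobspy/scrapers/utils.py (not shown in this patch).
import re

import tls_client


def count_urgent_words(description: str) -> int:
    """Count occurrences of (assumed) urgency keywords in a job description."""
    urgent_words = ["urgent", "urgently", "immediate", "immediately", "hiring now"]  # assumed list
    lowered = description.lower()
    return sum(lowered.count(word) for word in urgent_words)


def extract_emails_from_text(text: str) -> list[str] | None:
    """Same regex as the inline helpers this patch removes from each scraper."""
    if not text:
        return None
    email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    return email_regex.findall(text)


def create_session(proxy: str | None = None) -> tls_client.Session:
    """Fresh TLS-fingerprinted session, optionally routed through a proxy."""
    session = tls_client.Session(
        client_identifier="chrome112", random_tls_extension_order=True
    )
    session.proxies = {"http": proxy, "https": proxy} if proxy else {}  # proxy wiring is assumed
    return session
```

With `create_session` shaped like this, each `session.get(..., timeout_seconds=...)` call in the scrapers issues its request from a freshly created session, which is what the "use new session obj per request" bullet in the commit message refers to.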