From 4ec308a302e35b2a765a6bb73cee659c4011ff91 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 21 Feb 2025 14:14:55 -0600 Subject: [PATCH] refactor:organize code --- .github/workflows/publish-to-pypi.yml | 43 ++---- increment_version.py | 21 --- {src/jobspy => jobspy}/__init__.py | 109 ++++---------- .../scrapers => jobspy}/bayt/__init__.py | 20 +-- .../exceptions.py => jobspy/exception.py | 2 +- .../scrapers => jobspy}/glassdoor/__init__.py | 82 +++-------- .../glassdoor/constant.py | 0 jobspy/glassdoor/util.py | 42 ++++++ .../scrapers => jobspy}/google/__init__.py | 65 ++------- .../constants.py => jobspy/google/constant.py | 0 jobspy/google/util.py | 41 ++++++ .../scrapers => jobspy}/indeed/__init__.py | 124 +++------------- .../constants.py => jobspy/indeed/constant.py | 0 jobspy/indeed/util.py | 83 +++++++++++ .../scrapers => jobspy}/linkedin/__init__.py | 134 ++++-------------- .../linkedin/constant.py | 0 jobspy/linkedin/util.py | 85 +++++++++++ .../jobs/__init__.py => jobspy/model.py | 47 ++++++ .../scrapers/utils.py => jobspy/util.py | 61 +++++++- .../ziprecruiter/__init__.py | 94 +++--------- jobspy/ziprecruiter/constant.py | 29 ++++ jobspy/ziprecruiter/util.py | 31 ++++ pyproject.toml | 12 +- src/jobspy/scrapers/__init__.py | 58 -------- src/jobspy/scrapers/ziprecruiter/constants.py | 10 -- 25 files changed, 569 insertions(+), 624 deletions(-) delete mode 100644 increment_version.py rename {src/jobspy => jobspy}/__init__.py (70%) rename {src/jobspy/scrapers => jobspy}/bayt/__init__.py (95%) rename src/jobspy/scrapers/exceptions.py => jobspy/exception.py (97%) rename {src/jobspy/scrapers => jobspy}/glassdoor/__init__.py (83%) rename src/jobspy/scrapers/glassdoor/constants.py => jobspy/glassdoor/constant.py (100%) create mode 100644 jobspy/glassdoor/util.py rename {src/jobspy/scrapers => jobspy}/google/__init__.py (78%) rename src/jobspy/scrapers/google/constants.py => jobspy/google/constant.py (100%) create mode 100644 jobspy/google/util.py rename {src/jobspy/scrapers => jobspy}/indeed/__init__.py (70%) rename src/jobspy/scrapers/indeed/constants.py => jobspy/indeed/constant.py (100%) create mode 100644 jobspy/indeed/util.py rename {src/jobspy/scrapers => jobspy}/linkedin/__init__.py (77%) rename src/jobspy/scrapers/linkedin/constants.py => jobspy/linkedin/constant.py (100%) create mode 100644 jobspy/linkedin/util.py rename src/jobspy/jobs/__init__.py => jobspy/model.py (87%) rename src/jobspy/scrapers/utils.py => jobspy/util.py (86%) rename {src/jobspy/scrapers => jobspy}/ziprecruiter/__init__.py (73%) create mode 100644 jobspy/ziprecruiter/constant.py create mode 100644 jobspy/ziprecruiter/util.py delete mode 100644 src/jobspy/scrapers/__init__.py delete mode 100644 src/jobspy/scrapers/ziprecruiter/constants.py diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 366165c..6653e87 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -1,50 +1,33 @@ -name: Publish Python 🐍 distributions 📦 to PyPI -on: - pull_request: - types: - - closed - -permissions: - contents: write +name: Publish JobSpy to PyPi +on: push jobs: build-n-publish: - name: Build and publish Python 🐍 distributions 📦 to PyPI + name: Build and publish JobSpy to PyPi runs-on: ubuntu-latest - if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main' - steps: - uses: actions/checkout@v3 - - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.10" - - name: Install 
dependencies - run: pip install toml - - - name: Increment version - run: python increment_version.py - - - name: Commit version increment - run: | - git config --global user.name 'github-actions' - git config --global user.email 'github-actions@github.com' - git add pyproject.toml - git commit -m 'Increment version' - - - name: Push changes - run: git push - - name: Install poetry - run: pip install poetry --user + run: >- + python3 -m + pip install + poetry + --user - name: Build distribution 📦 - run: poetry build + run: >- + python3 -m + poetry + build - name: Publish distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/increment_version.py b/increment_version.py deleted file mode 100644 index f359bd7..0000000 --- a/increment_version.py +++ /dev/null @@ -1,21 +0,0 @@ -import toml - -def increment_version(version): - major, minor, patch = map(int, version.split('.')) - patch += 1 - return f"{major}.{minor}.{patch}" - -# Load pyproject.toml -with open('pyproject.toml', 'r') as file: - pyproject = toml.load(file) - -# Increment the version -current_version = pyproject['tool']['poetry']['version'] -new_version = increment_version(current_version) -pyproject['tool']['poetry']['version'] = new_version - -# Save the updated pyproject.toml -with open('pyproject.toml', 'w') as file: - toml.dump(pyproject, file) - -print(f"Version updated from {current_version} to {new_version}") diff --git a/src/jobspy/__init__.py b/jobspy/__init__.py similarity index 70% rename from src/jobspy/__init__.py rename to jobspy/__init__.py index 8183338..ab57849 100644 --- a/src/jobspy/__init__.py +++ b/jobspy/__init__.py @@ -1,25 +1,27 @@ from __future__ import annotations -import pandas as pd -from typing import Tuple from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Tuple -from .jobs import JobType, Location -from .scrapers.utils import set_logger_level, extract_salary, create_logger -from .scrapers.indeed import IndeedScraper -from .scrapers.ziprecruiter import ZipRecruiterScraper -from .scrapers.glassdoor import GlassdoorScraper -from .scrapers.google import GoogleJobsScraper -from .scrapers.linkedin import LinkedInScraper -from .scrapers.bayt import BaytScraper -from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country -from .scrapers.exceptions import ( - LinkedInException, - IndeedException, - ZipRecruiterException, - GlassdoorException, - GoogleJobsException, +import pandas as pd + +from jobspy.bayt import BaytScraper +from jobspy.glassdoor import Glassdoor +from jobspy.google import Google +from jobspy.indeed import Indeed +from jobspy.linkedin import LinkedIn +from jobspy.model import JobType, Location, JobResponse, Country +from jobspy.model import SalarySource, ScraperInput, Site +from jobspy.util import ( + set_logger_level, + extract_salary, + create_logger, + get_enum_from_value, + map_str_to_site, + convert_to_annual, + desired_order, ) +from jobspy.ziprecruiter import ZipRecruiter def scrape_jobs( @@ -33,7 +35,6 @@ def scrape_jobs( easy_apply: bool | None = None, results_wanted: int = 15, country_indeed: str = "usa", - hyperlinks: bool = False, proxies: list[str] | str | None = None, ca_cert: str | None = None, description_format: str = "markdown", @@ -46,28 +47,18 @@ def scrape_jobs( **kwargs, ) -> pd.DataFrame: """ - Simultaneously scrapes job data from multiple job sites. 
-    :return: pandas dataframe containing job data
+    Scrapes job data from job boards concurrently
+    :return: Pandas DataFrame containing job data
     """
     SCRAPER_MAPPING = {
-        Site.LINKEDIN: LinkedInScraper,
-        Site.INDEED: IndeedScraper,
-        Site.ZIP_RECRUITER: ZipRecruiterScraper,
-        Site.GLASSDOOR: GlassdoorScraper,
-        Site.GOOGLE: GoogleJobsScraper,
+        Site.LINKEDIN: LinkedIn,
+        Site.INDEED: Indeed,
+        Site.ZIP_RECRUITER: ZipRecruiter,
+        Site.GLASSDOOR: Glassdoor,
+        Site.GOOGLE: Google,
         Site.BAYT: BaytScraper,
     }
     set_logger_level(verbose)
-
-    def map_str_to_site(site_name: str) -> Site:
-        return Site[site_name.upper()]
-
-    def get_enum_from_value(value_str):
-        for job_type in JobType:
-            if value_str in job_type.value:
-                return job_type
-        raise Exception(f"Invalid job type: {value_str}")
-
     job_type = get_enum_from_value(job_type) if job_type else None

     def get_site_type():
@@ -127,28 +118,12 @@ def scrape_jobs(
             site_value, scraped_data = future.result()
             site_to_jobs_dict[site_value] = scraped_data

-    def convert_to_annual(job_data: dict):
-        if job_data["interval"] == "hourly":
-            job_data["min_amount"] *= 2080
-            job_data["max_amount"] *= 2080
-        if job_data["interval"] == "monthly":
-            job_data["min_amount"] *= 12
-            job_data["max_amount"] *= 12
-        if job_data["interval"] == "weekly":
-            job_data["min_amount"] *= 52
-            job_data["max_amount"] *= 52
-        if job_data["interval"] == "daily":
-            job_data["min_amount"] *= 260
-            job_data["max_amount"] *= 260
-        job_data["interval"] = "yearly"
-
     jobs_dfs: list[pd.DataFrame] = []

     for site, job_response in site_to_jobs_dict.items():
         for job in job_response.jobs:
             job_data = job.dict()
             job_url = job_data["job_url"]
-            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
@@ -211,38 +186,6 @@ def scrape_jobs(
     # Step 2: Concatenate the filtered DataFrames
     jobs_df = pd.concat(filtered_dfs, ignore_index=True)

-    # Desired column order
-    desired_order = [
-        "id",
-        "site",
-        "job_url_hyper" if hyperlinks else "job_url",
-        "job_url_direct",
-        "title",
-        "company",
-        "location",
-        "date_posted",
-        "job_type",
-        "salary_source",
-        "interval",
-        "min_amount",
-        "max_amount",
-        "currency",
-        "is_remote",
-        "job_level",
-        "job_function",
-        "listing_type",
-        "emails",
-        "description",
-        "company_industry",
-        "company_url",
-        "company_logo",
-        "company_url_direct",
-        "company_addresses",
-        "company_num_employees",
-        "company_revenue",
-        "company_description",
-    ]
-
     # Step 3: Ensure all desired columns are present, adding missing ones as empty
     for column in desired_order:
         if column not in jobs_df.columns:
diff --git a/src/jobspy/scrapers/bayt/__init__.py b/jobspy/bayt/__init__.py
similarity index 95%
rename from src/jobspy/scrapers/bayt/__init__.py
rename to jobspy/bayt/__init__.py
index 12b375e..0fd29e9 100644
--- a/src/jobspy/scrapers/bayt/__init__.py
+++ b/jobspy/bayt/__init__.py
@@ -1,10 +1,3 @@
-"""
-jobspy.scrapers.bayt
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Bayt.
-"""
-
 from __future__ import annotations

 import random
@@ -12,9 +5,16 @@ import time

 from bs4 import BeautifulSoup

-from .. 
import Scraper, ScraperInput, Site -from ..utils import create_logger, create_session -from ...jobs import JobPost, JobResponse, Location, Country +from jobspy.model import ( + Scraper, + ScraperInput, + Site, + JobPost, + JobResponse, + Location, + Country, +) +from jobspy.util import create_logger, create_session log = create_logger("Bayt") diff --git a/src/jobspy/scrapers/exceptions.py b/jobspy/exception.py similarity index 97% rename from src/jobspy/scrapers/exceptions.py rename to jobspy/exception.py index ad63b06..b955a8f 100644 --- a/src/jobspy/scrapers/exceptions.py +++ b/jobspy/exception.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.exceptions +jobspy.jobboard.exceptions ~~~~~~~~~~~~~~~~~~~ This module contains the set of Scrapers' exceptions. diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/jobspy/glassdoor/__init__.py similarity index 83% rename from src/jobspy/scrapers/glassdoor/__init__.py rename to jobspy/glassdoor/__init__.py index 0455ec2..225d7fd 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/jobspy/glassdoor/__init__.py @@ -1,41 +1,38 @@ -""" -jobspy.scrapers.glassdoor -~~~~~~~~~~~~~~~~~~~ - -This module contains routines to scrape Glassdoor. -""" - from __future__ import annotations import re import json import requests -from typing import Optional, Tuple +from typing import Tuple from datetime import datetime, timedelta from concurrent.futures import ThreadPoolExecutor, as_completed -from .constants import fallback_token, query_template, headers -from .. import Scraper, ScraperInput, Site -from ..utils import extract_emails_from_text, create_logger -from ..exceptions import GlassdoorException -from ..utils import ( +from jobspy.glassdoor.constant import fallback_token, query_template, headers +from jobspy.glassdoor.util import ( + get_cursor_for_page, + parse_compensation, + parse_location, +) +from jobspy.util import ( + extract_emails_from_text, + create_logger, create_session, markdown_converter, ) -from ...jobs import ( +from jobspy.exception import GlassdoorException +from jobspy.model import ( JobPost, - Compensation, - CompensationInterval, - Location, JobResponse, - JobType, DescriptionFormat, + Scraper, + ScraperInput, + Site, ) log = create_logger("Glassdoor") -class GlassdoorScraper(Scraper): +class Glassdoor(Scraper): def __init__( self, proxies: list[str] | str | None = None, ca_cert: str | None = None ): @@ -146,7 +143,7 @@ class GlassdoorScraper(Scraper): except Exception as exc: raise GlassdoorException(f"Glassdoor generated an exception: {exc}") - return jobs, self.get_cursor_for_page( + return jobs, get_cursor_for_page( res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 ) @@ -185,9 +182,9 @@ class GlassdoorScraper(Scraper): if location_type == "S": is_remote = True else: - location = self.parse_location(location_name) + location = parse_location(location_name) - compensation = self.parse_compensation(job["header"]) + compensation = parse_compensation(job["header"]) try: description = self._fetch_job_description(job_id) except: @@ -321,44 +318,3 @@ class GlassdoorScraper(Scraper): {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]} ) return json.dumps([payload]) - - @staticmethod - def parse_compensation(data: dict) -> Optional[Compensation]: - pay_period = data.get("payPeriod") - adjusted_pay = data.get("payPeriodAdjustedPay") - currency = data.get("payCurrency", "USD") - if not pay_period or not adjusted_pay: - return None - - interval = None - if pay_period == "ANNUAL": - interval = 
CompensationInterval.YEARLY - elif pay_period: - interval = CompensationInterval.get_interval(pay_period) - min_amount = int(adjusted_pay.get("p10") // 1) - max_amount = int(adjusted_pay.get("p90") // 1) - return Compensation( - interval=interval, - min_amount=min_amount, - max_amount=max_amount, - currency=currency, - ) - - @staticmethod - def get_job_type_enum(job_type_str: str) -> list[JobType] | None: - for job_type in JobType: - if job_type_str in job_type.value: - return [job_type] - - @staticmethod - def parse_location(location_name: str) -> Location | None: - if not location_name or location_name == "Remote": - return - city, _, state = location_name.partition(", ") - return Location(city=city, state=state) - - @staticmethod - def get_cursor_for_page(pagination_cursors, page_num): - for cursor_data in pagination_cursors: - if cursor_data["pageNumber"] == page_num: - return cursor_data["cursor"] diff --git a/src/jobspy/scrapers/glassdoor/constants.py b/jobspy/glassdoor/constant.py similarity index 100% rename from src/jobspy/scrapers/glassdoor/constants.py rename to jobspy/glassdoor/constant.py diff --git a/jobspy/glassdoor/util.py b/jobspy/glassdoor/util.py new file mode 100644 index 0000000..c52664e --- /dev/null +++ b/jobspy/glassdoor/util.py @@ -0,0 +1,42 @@ +from jobspy.model import Compensation, CompensationInterval, Location, JobType + + +def parse_compensation(data: dict) -> Compensation | None: + pay_period = data.get("payPeriod") + adjusted_pay = data.get("payPeriodAdjustedPay") + currency = data.get("payCurrency", "USD") + if not pay_period or not adjusted_pay: + return None + + interval = None + if pay_period == "ANNUAL": + interval = CompensationInterval.YEARLY + elif pay_period: + interval = CompensationInterval.get_interval(pay_period) + min_amount = int(adjusted_pay.get("p10") // 1) + max_amount = int(adjusted_pay.get("p90") // 1) + return Compensation( + interval=interval, + min_amount=min_amount, + max_amount=max_amount, + currency=currency, + ) + + +def get_job_type_enum(job_type_str: str) -> list[JobType] | None: + for job_type in JobType: + if job_type_str in job_type.value: + return [job_type] + + +def parse_location(location_name: str) -> Location | None: + if not location_name or location_name == "Remote": + return + city, _, state = location_name.partition(", ") + return Location(city=city, state=state) + + +def get_cursor_for_page(pagination_cursors, page_num): + for cursor_data in pagination_cursors: + if cursor_data["pageNumber"] == page_num: + return cursor_data["cursor"] diff --git a/src/jobspy/scrapers/google/__init__.py b/jobspy/google/__init__.py similarity index 78% rename from src/jobspy/scrapers/google/__init__.py rename to jobspy/google/__init__.py index 0c47278..e77903c 100644 --- a/src/jobspy/scrapers/google/__init__.py +++ b/jobspy/google/__init__.py @@ -1,10 +1,3 @@ -""" -jobspy.scrapers.google -~~~~~~~~~~~~~~~~~~~ - -This module contains routines to scrape Google. -""" - from __future__ import annotations import math @@ -13,23 +6,21 @@ import json from typing import Tuple from datetime import datetime, timedelta -from .constants import headers_jobs, headers_initial, async_param -from .. 
import Scraper, ScraperInput, Site -from ..utils import extract_emails_from_text, create_logger, extract_job_type -from ..utils import ( - create_session, -) -from ...jobs import ( +from jobspy.google.constant import headers_jobs, headers_initial, async_param +from jobspy.model import ( + Scraper, + ScraperInput, + Site, JobPost, JobResponse, Location, JobType, ) - -log = create_logger("Google") +from jobspy.util import extract_emails_from_text, extract_job_type, create_session +from jobspy.google.util import log, find_job_info_initial_page, find_job_info -class GoogleJobsScraper(Scraper): +class Google(Scraper): def __init__( self, proxies: list[str] | str | None = None, ca_cert: str | None = None ): @@ -135,7 +126,7 @@ class GoogleJobsScraper(Scraper): pattern_fc = r'
<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
         match_fc = re.search(pattern_fc, response.text)
         data_async_fc = match_fc.group(1) if match_fc else None
-        jobs_raw = self._find_job_info_initial_page(response.text)
+        jobs_raw = find_job_info_initial_page(response.text)
         jobs = []
         for job_raw in jobs_raw:
             job_post = self._parse_job(job_raw)
             if job_post:
@@ -167,7 +158,7 @@ class GoogleJobsScraper(Scraper):
                 continue
             job_d = json.loads(job_data)
-            job_info = self._find_job_info(job_d)
+            job_info = find_job_info(job_d)
             job_post = self._parse_job(job_info)
             if job_post:
                 jobs_on_page.append(job_post)
@@ -209,39 +200,3 @@ class GoogleJobsScraper(Scraper):
             job_type=extract_job_type(description),
         )
         return job_post
-
-    @staticmethod
-    def _find_job_info(jobs_data: list | dict) -> list | None:
-        """Iterates through the JSON data to find the job listings"""
-        if isinstance(jobs_data, dict):
-            for key, value in jobs_data.items():
-                if key == "520084652" and isinstance(value, list):
-                    return value
-                else:
-                    result = GoogleJobsScraper._find_job_info(value)
-                    if result:
-                        return result
-        elif isinstance(jobs_data, list):
-            for item in jobs_data:
-                result = GoogleJobsScraper._find_job_info(item)
-                if result:
-                    return result
-        return None
-
-    @staticmethod
-    def _find_job_info_initial_page(html_text: str):
-        pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
-        results = []
-        matches = re.finditer(pattern, html_text)
-
-        import json
-
-        for match in matches:
-            try:
-                parsed_data = json.loads(match.group(1))
-                results.append(parsed_data)
-
-            except json.JSONDecodeError as e:
-                log.error(f"Failed to parse match: {str(e)}")
-                results.append({"raw_match": match.group(0), "error": str(e)})
-        return results
diff --git a/src/jobspy/scrapers/google/constants.py b/jobspy/google/constant.py
similarity index 100%
rename from src/jobspy/scrapers/google/constants.py
rename to jobspy/google/constant.py
diff --git a/jobspy/google/util.py b/jobspy/google/util.py
new file mode 100644
index 0000000..89c059b
--- /dev/null
+++ b/jobspy/google/util.py
@@ -0,0 +1,41 @@
+import re
+
+from jobspy.util import create_logger
+
+log = create_logger("Google")
+
+
+def find_job_info(jobs_data: list | dict) -> list | None:
+    """Iterates through the JSON data to find the job listings"""
+    if isinstance(jobs_data, dict):
+        for key, value in jobs_data.items():
+            if key == "520084652" and isinstance(value, list):
+                return value
+            else:
+                result = find_job_info(value)
+                if result:
+                    return result
+    elif isinstance(jobs_data, list):
+        for item in jobs_data:
+            result = find_job_info(item)
+            if result:
+                return result
+    return None
+
+
+def find_job_info_initial_page(html_text: str):
+    pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
+    results = []
+    matches = re.finditer(pattern, html_text)
+
+    import json
+
+    for match in matches:
+        try:
+            parsed_data = json.loads(match.group(1))
+            results.append(parsed_data)
+
+        except json.JSONDecodeError as e:
+            log.error(f"Failed to parse match: {str(e)}")
+            results.append({"raw_match": match.group(0), "error": str(e)})
+    return results
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/jobspy/indeed/__init__.py
similarity index 70%
rename from src/jobspy/scrapers/indeed/__init__.py
rename to jobspy/indeed/__init__.py
index b9235ae..adbc9e9 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/jobspy/indeed/__init__.py
@@ -1,39 +1,32 @@
-"""
-jobspy.scrapers.indeed
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Indeed. 
-""" - from __future__ import annotations import math -from typing import Tuple from datetime import datetime +from typing import Tuple -from .constants import job_search_query, api_headers -from .. import Scraper, ScraperInput, Site -from ..utils import ( - extract_emails_from_text, - get_enum_from_job_type, - markdown_converter, - create_session, - create_logger, -) -from ...jobs import ( +from jobspy.indeed.constant import job_search_query, api_headers +from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type +from jobspy.model import ( + Scraper, + ScraperInput, + Site, JobPost, - Compensation, - CompensationInterval, Location, JobResponse, JobType, DescriptionFormat, ) +from jobspy.util import ( + extract_emails_from_text, + markdown_converter, + create_session, + create_logger, +) log = create_logger("Indeed") -class IndeedScraper(Scraper): +class Indeed(Scraper): def __init__( self, proxies: list[str] | str | None = None, ca_cert: str | None = None ): @@ -213,7 +206,7 @@ class IndeedScraper(Scraper): if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: description = markdown_converter(description) - job_type = self._get_job_type(job["attributes"]) + job_type = get_job_type(job["attributes"]) timestamp_seconds = job["datePublished"] / 1000 date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d") employer = job["employer"].get("dossier") if job["employer"] else None @@ -234,14 +227,14 @@ class IndeedScraper(Scraper): country=job.get("location", {}).get("countryCode"), ), job_type=job_type, - compensation=self._get_compensation(job["compensation"]), + compensation=get_compensation(job["compensation"]), date_posted=date_posted, job_url=job_url, job_url_direct=( job["recruit"].get("viewJobUrl") if job.get("recruit") else None ), emails=extract_emails_from_text(description) if description else None, - is_remote=self._is_job_remote(job, description), + is_remote=is_job_remote(job, description), company_addresses=( employer_details["addresses"][0] if employer_details.get("addresses") @@ -265,86 +258,3 @@ class IndeedScraper(Scraper): else None ), ) - - @staticmethod - def _get_job_type(attributes: list) -> list[JobType]: - """ - Parses the attributes to get list of job types - :param attributes: - :return: list of JobType - """ - job_types: list[JobType] = [] - for attribute in attributes: - job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower() - job_type = get_enum_from_job_type(job_type_str) - if job_type: - job_types.append(job_type) - return job_types - - @staticmethod - def _get_compensation(compensation: dict) -> Compensation | None: - """ - Parses the job to get compensation - :param job: - :return: compensation object - """ - if not compensation["baseSalary"] and not compensation["estimated"]: - return None - comp = ( - compensation["baseSalary"] - if compensation["baseSalary"] - else compensation["estimated"]["baseSalary"] - ) - if not comp: - return None - interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"]) - if not interval: - return None - min_range = comp["range"].get("min") - max_range = comp["range"].get("max") - return Compensation( - interval=interval, - min_amount=int(min_range) if min_range is not None else None, - max_amount=int(max_range) if max_range is not None else None, - currency=( - compensation["estimated"]["currencyCode"] - if compensation["estimated"] - else compensation["currencyCode"] - ), - ) - - @staticmethod - def _is_job_remote(job: dict, description: str) 
-> bool:
-        """
-        Searches the description, location, and attributes to check if job is remote
-        """
-        remote_keywords = ["remote", "work from home", "wfh"]
-        is_remote_in_attributes = any(
-            any(keyword in attr["label"].lower() for keyword in remote_keywords)
-            for attr in job["attributes"]
-        )
-        is_remote_in_description = any(
-            keyword in description.lower() for keyword in remote_keywords
-        )
-        is_remote_in_location = any(
-            keyword in job["location"]["formatted"]["long"].lower()
-            for keyword in remote_keywords
-        )
-        return (
-            is_remote_in_attributes or is_remote_in_description or is_remote_in_location
-        )
-
-    @staticmethod
-    def _get_compensation_interval(interval: str) -> CompensationInterval:
-        interval_mapping = {
-            "DAY": "DAILY",
-            "YEAR": "YEARLY",
-            "HOUR": "HOURLY",
-            "WEEK": "WEEKLY",
-            "MONTH": "MONTHLY",
-        }
-        mapped_interval = interval_mapping.get(interval.upper(), None)
-        if mapped_interval and mapped_interval in CompensationInterval.__members__:
-            return CompensationInterval[mapped_interval]
-        else:
-            raise ValueError(f"Unsupported interval: {interval}")
diff --git a/src/jobspy/scrapers/indeed/constants.py b/jobspy/indeed/constant.py
similarity index 100%
rename from src/jobspy/scrapers/indeed/constants.py
rename to jobspy/indeed/constant.py
diff --git a/jobspy/indeed/util.py b/jobspy/indeed/util.py
new file mode 100644
index 0000000..5515ed1
--- /dev/null
+++ b/jobspy/indeed/util.py
@@ -0,0 +1,83 @@
+from jobspy.model import CompensationInterval, JobType, Compensation
+from jobspy.util import get_enum_from_job_type
+
+
+def get_job_type(attributes: list) -> list[JobType]:
+    """
+    Parses the attributes to get list of job types
+    :param attributes:
+    :return: list of JobType
+    """
+    job_types: list[JobType] = []
+    for attribute in attributes:
+        job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
+        job_type = get_enum_from_job_type(job_type_str)
+        if job_type:
+            job_types.append(job_type)
+    return job_types
+
+
+def get_compensation(compensation: dict) -> Compensation | None:
+    """
+    Parses the job to get compensation
+    :param compensation:
+    :return: compensation object
+    """
+    if not compensation["baseSalary"] and not compensation["estimated"]:
+        return None
+    comp = (
+        compensation["baseSalary"]
+        if compensation["baseSalary"]
+        else compensation["estimated"]["baseSalary"]
+    )
+    if not comp:
+        return None
+    interval = get_compensation_interval(comp["unitOfWork"])
+    if not interval:
+        return None
+    min_range = comp["range"].get("min")
+    max_range = comp["range"].get("max")
+    return Compensation(
+        interval=interval,
+        min_amount=int(min_range) if min_range is not None else None,
+        max_amount=int(max_range) if max_range is not None else None,
+        currency=(
+            compensation["estimated"]["currencyCode"]
+            if compensation["estimated"]
+            else compensation["currencyCode"]
+        ),
+    )
+
+
+def is_job_remote(job: dict, description: str) -> bool:
+    """
+    Searches the description, location, and attributes to check if job is remote
+    """
+    remote_keywords = ["remote", "work from home", "wfh"]
+    is_remote_in_attributes = any(
+        any(keyword in attr["label"].lower() for keyword in remote_keywords)
+        for attr in job["attributes"]
+    )
+    
is_remote_in_description = any( + keyword in description.lower() for keyword in remote_keywords + ) + is_remote_in_location = any( + keyword in job["location"]["formatted"]["long"].lower() + for keyword in remote_keywords + ) + return is_remote_in_attributes or is_remote_in_description or is_remote_in_location + + +def get_compensation_interval(interval: str) -> CompensationInterval: + interval_mapping = { + "DAY": "DAILY", + "YEAR": "YEARLY", + "HOUR": "HOURLY", + "WEEK": "WEEKLY", + "MONTH": "MONTHLY", + } + mapped_interval = interval_mapping.get(interval.upper(), None) + if mapped_interval and mapped_interval in CompensationInterval.__members__: + return CompensationInterval[mapped_interval] + else: + raise ValueError(f"Unsupported interval: {interval}") diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/jobspy/linkedin/__init__.py similarity index 77% rename from src/jobspy/scrapers/linkedin/__init__.py rename to jobspy/linkedin/__init__.py index d854cbe..db20f12 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/jobspy/linkedin/__init__.py @@ -1,47 +1,48 @@ -""" -jobspy.scrapers.linkedin -~~~~~~~~~~~~~~~~~~~ - -This module contains routines to scrape LinkedIn. -""" - from __future__ import annotations import math -import time import random -import regex as re -from typing import Optional +import time from datetime import datetime - -from bs4.element import Tag -from bs4 import BeautifulSoup +from typing import Optional from urllib.parse import urlparse, urlunparse, unquote -from .constants import headers -from .. import Scraper, ScraperInput, Site -from ..exceptions import LinkedInException -from ..utils import create_session, remove_attributes, create_logger -from ...jobs import ( +import regex as re +from bs4 import BeautifulSoup +from bs4.element import Tag + +from jobspy.exception import LinkedInException +from jobspy.linkedin.constant import headers +from jobspy.linkedin.util import ( + job_type_code, + parse_job_type, + parse_job_level, + parse_company_industry, +) +from jobspy.model import ( JobPost, Location, JobResponse, - JobType, Country, Compensation, DescriptionFormat, + Scraper, + ScraperInput, + Site, ) -from ..utils import ( +from jobspy.util import ( extract_emails_from_text, - get_enum_from_job_type, currency_parser, markdown_converter, + create_session, + remove_attributes, + create_logger, ) log = create_logger("LinkedIn") -class LinkedInScraper(Scraper): +class LinkedIn(Scraper): base_url = "https://www.linkedin.com" delay = 3 band_delay = 4 @@ -95,7 +96,7 @@ class LinkedInScraper(Scraper): "distance": scraper_input.distance, "f_WT": 2 if scraper_input.is_remote else None, "f_JT": ( - self.job_type_code(scraper_input.job_type) + job_type_code(scraper_input.job_type) if scraper_input.job_type else None ), @@ -282,9 +283,9 @@ class LinkedInScraper(Scraper): ) return { "description": description, - "job_level": self._parse_job_level(soup), - "company_industry": self._parse_company_industry(soup), - "job_type": self._parse_job_type(soup), + "job_level": parse_job_level(soup), + "company_industry": parse_company_industry(soup), + "job_type": parse_job_type(soup), "job_url_direct": self._parse_job_url_direct(soup), "company_logo": company_logo, "job_function": job_function, @@ -316,77 +317,6 @@ class LinkedInScraper(Scraper): location = Location(city=city, state=state, country=country) return location - @staticmethod - def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None: - """ - Gets the job type from job page - :param soup_job_type: 
- :return: JobType - """ - h3_tag = soup_job_type.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Employment type" in text, - ) - employment_type = None - if h3_tag: - employment_type_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text description__job-criteria-text--criteria", - ) - if employment_type_span: - employment_type = employment_type_span.get_text(strip=True) - employment_type = employment_type.lower() - employment_type = employment_type.replace("-", "") - - return [get_enum_from_job_type(employment_type)] if employment_type else [] - - @staticmethod - def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None: - """ - Gets the job level from job page - :param soup_job_level: - :return: str - """ - h3_tag = soup_job_level.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Seniority level" in text, - ) - job_level = None - if h3_tag: - job_level_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text description__job-criteria-text--criteria", - ) - if job_level_span: - job_level = job_level_span.get_text(strip=True) - - return job_level - - @staticmethod - def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None: - """ - Gets the company industry from job page - :param soup_industry: - :return: str - """ - h3_tag = soup_industry.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Industries" in text, - ) - industry = None - if h3_tag: - industry_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text description__job-criteria-text--criteria", - ) - if industry_span: - industry = industry_span.get_text(strip=True) - - return industry - def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None: """ Gets the job url direct from job page @@ -403,13 +333,3 @@ class LinkedInScraper(Scraper): job_url_direct = unquote(job_url_direct_match.group()) return job_url_direct - - @staticmethod - def job_type_code(job_type_enum: JobType) -> str: - return { - JobType.FULL_TIME: "F", - JobType.PART_TIME: "P", - JobType.INTERNSHIP: "I", - JobType.CONTRACT: "C", - JobType.TEMPORARY: "T", - }.get(job_type_enum, "") diff --git a/src/jobspy/scrapers/linkedin/constants.py b/jobspy/linkedin/constant.py similarity index 100% rename from src/jobspy/scrapers/linkedin/constants.py rename to jobspy/linkedin/constant.py diff --git a/jobspy/linkedin/util.py b/jobspy/linkedin/util.py new file mode 100644 index 0000000..fe37c48 --- /dev/null +++ b/jobspy/linkedin/util.py @@ -0,0 +1,85 @@ +from bs4 import BeautifulSoup + +from jobspy.model import JobType +from jobspy.util import get_enum_from_job_type + + +def job_type_code(job_type_enum: JobType) -> str: + return { + JobType.FULL_TIME: "F", + JobType.PART_TIME: "P", + JobType.INTERNSHIP: "I", + JobType.CONTRACT: "C", + JobType.TEMPORARY: "T", + }.get(job_type_enum, "") + + +def parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None: + """ + Gets the job type from job page + :param soup_job_type: + :return: JobType + """ + h3_tag = soup_job_type.find( + "h3", + class_="description__job-criteria-subheader", + string=lambda text: "Employment type" in text, + ) + employment_type = None + if h3_tag: + employment_type_span = h3_tag.find_next_sibling( + "span", + class_="description__job-criteria-text description__job-criteria-text--criteria", + ) + if employment_type_span: + employment_type = employment_type_span.get_text(strip=True) + 
employment_type = employment_type.lower() + employment_type = employment_type.replace("-", "") + + return [get_enum_from_job_type(employment_type)] if employment_type else [] + + +def parse_job_level(soup_job_level: BeautifulSoup) -> str | None: + """ + Gets the job level from job page + :param soup_job_level: + :return: str + """ + h3_tag = soup_job_level.find( + "h3", + class_="description__job-criteria-subheader", + string=lambda text: "Seniority level" in text, + ) + job_level = None + if h3_tag: + job_level_span = h3_tag.find_next_sibling( + "span", + class_="description__job-criteria-text description__job-criteria-text--criteria", + ) + if job_level_span: + job_level = job_level_span.get_text(strip=True) + + return job_level + + +def parse_company_industry(soup_industry: BeautifulSoup) -> str | None: + """ + Gets the company industry from job page + :param soup_industry: + :return: str + """ + h3_tag = soup_industry.find( + "h3", + class_="description__job-criteria-subheader", + string=lambda text: "Industries" in text, + ) + industry = None + if h3_tag: + industry_span = h3_tag.find_next_sibling( + "span", + class_="description__job-criteria-text description__job-criteria-text--criteria", + ) + if industry_span: + industry = industry_span.get_text(strip=True) + + return industry diff --git a/src/jobspy/jobs/__init__.py b/jobspy/model.py similarity index 87% rename from src/jobspy/jobs/__init__.py rename to jobspy/model.py index c51839c..b11e163 100644 --- a/src/jobspy/jobs/__init__.py +++ b/jobspy/model.py @@ -1,5 +1,6 @@ from __future__ import annotations +from abc import ABC, abstractmethod from typing import Optional from datetime import date from enum import Enum @@ -265,3 +266,49 @@ class JobPost(BaseModel): class JobResponse(BaseModel): jobs: list[JobPost] = [] + + +class Site(Enum): + LINKEDIN = "linkedin" + INDEED = "indeed" + ZIP_RECRUITER = "zip_recruiter" + GLASSDOOR = "glassdoor" + GOOGLE = "google" + BAYT = "bayt" + + +class SalarySource(Enum): + DIRECT_DATA = "direct_data" + DESCRIPTION = "description" + + +class ScraperInput(BaseModel): + site_type: list[Site] + search_term: str | None = None + google_search_term: str | None = None + + location: str | None = None + country: Country | None = Country.USA + distance: int | None = None + is_remote: bool = False + job_type: JobType | None = None + easy_apply: bool | None = None + offset: int = 0 + linkedin_fetch_description: bool = False + linkedin_company_ids: list[int] | None = None + description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN + + results_wanted: int = 15 + hours_old: int | None = None + + +class Scraper(ABC): + def __init__( + self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None + ): + self.site = site + self.proxies = proxies + self.ca_cert = ca_cert + + @abstractmethod + def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... 
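[Review note] The hunk above folds the scraper base types (Site, SalarySource, ScraperInput, and the Scraper ABC) from the deleted src/jobspy/scrapers/__init__.py into jobspy/model.py. A minimal sketch of how a board-specific scraper plugs into the relocated ABC after this refactor; DemoScraper and its stub body are illustrative only, not part of the patch:

    from jobspy.model import JobPost, JobResponse, Scraper, ScraperInput, Site

    class DemoScraper(Scraper):
        # Hypothetical scraper illustrating the post-refactor base-class contract.
        def __init__(self, proxies: list[str] | None = None, ca_cert: str | None = None):
            # Site, ScraperInput, and Scraper now come from jobspy.model,
            # not the removed jobspy.scrapers package.
            super().__init__(Site.GOOGLE, proxies=proxies, ca_cert=ca_cert)

        def scrape(self, scraper_input: ScraperInput) -> JobResponse:
            # A real implementation would fetch and parse listings here.
            jobs: list[JobPost] = []
            return JobResponse(jobs=jobs)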
diff --git a/src/jobspy/scrapers/utils.py b/jobspy/util.py similarity index 86% rename from src/jobspy/scrapers/utils.py rename to jobspy/util.py index 32e0663..36c13e7 100644 --- a/src/jobspy/scrapers/utils.py +++ b/jobspy/util.py @@ -11,7 +11,7 @@ import urllib3 from markdownify import markdownify as md from requests.adapters import HTTPAdapter, Retry -from ..jobs import CompensationInterval, JobType +from jobspy.model import CompensationInterval, JobType, Site urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -286,3 +286,62 @@ def extract_job_type(description: str): listing_types.append(key) return listing_types if listing_types else None + + +def map_str_to_site(site_name: str) -> Site: + return Site[site_name.upper()] + + +def get_enum_from_value(value_str): + for job_type in JobType: + if value_str in job_type.value: + return job_type + raise Exception(f"Invalid job type: {value_str}") + + +def convert_to_annual(job_data: dict): + if job_data["interval"] == "hourly": + job_data["min_amount"] *= 2080 + job_data["max_amount"] *= 2080 + if job_data["interval"] == "monthly": + job_data["min_amount"] *= 12 + job_data["max_amount"] *= 12 + if job_data["interval"] == "weekly": + job_data["min_amount"] *= 52 + job_data["max_amount"] *= 52 + if job_data["interval"] == "daily": + job_data["min_amount"] *= 260 + job_data["max_amount"] *= 260 + job_data["interval"] = "yearly" + + +desired_order = [ + "id", + "site", + "job_url", + "job_url_direct", + "title", + "company", + "location", + "date_posted", + "job_type", + "salary_source", + "interval", + "min_amount", + "max_amount", + "currency", + "is_remote", + "job_level", + "job_function", + "listing_type", + "emails", + "description", + "company_industry", + "company_url", + "company_logo", + "company_url_direct", + "company_addresses", + "company_num_employees", + "company_revenue", + "company_description", +] diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/jobspy/ziprecruiter/__init__.py similarity index 73% rename from src/jobspy/scrapers/ziprecruiter/__init__.py rename to jobspy/ziprecruiter/__init__.py index 816331e..c91ba57 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/jobspy/ziprecruiter/__init__.py @@ -1,10 +1,3 @@ -""" -jobspy.scrapers.ziprecruiter -~~~~~~~~~~~~~~~~~~~ - -This module contains routines to scrape ZipRecruiter. -""" - from __future__ import annotations import json @@ -13,33 +6,34 @@ import re import time from concurrent.futures import ThreadPoolExecutor from datetime import datetime -from typing import Optional, Tuple, Any from bs4 import BeautifulSoup -from .constants import headers -from .. 
import Scraper, ScraperInput, Site -from ..utils import ( +from jobspy.ziprecruiter.constant import headers, get_cookie_data +from jobspy.util import ( extract_emails_from_text, create_session, markdown_converter, remove_attributes, create_logger, ) -from ...jobs import ( +from jobspy.model import ( JobPost, Compensation, Location, JobResponse, - JobType, Country, DescriptionFormat, + Scraper, + ScraperInput, + Site, ) +from jobspy.ziprecruiter.util import get_job_type_enum, add_params log = create_logger("ZipRecruiter") -class ZipRecruiterScraper(Scraper): +class ZipRecruiter(Scraper): base_url = "https://www.ziprecruiter.com" api_url = "https://api.ziprecruiter.com" @@ -90,7 +84,7 @@ class ZipRecruiterScraper(Scraper): def _find_jobs_in_page( self, scraper_input: ScraperInput, continue_token: str | None = None - ) -> Tuple[list[JobPost], Optional[str]]: + ) -> tuple[list[JobPost], str | None]: """ Scrapes a page of ZipRecruiter for jobs with scraper_input criteria :param scraper_input: @@ -98,7 +92,7 @@ class ZipRecruiterScraper(Scraper): :return: jobs found on page """ jobs_list = [] - params = self._add_params(scraper_input) + params = add_params(scraper_input) if continue_token: params["continue_from"] = continue_token try: @@ -151,7 +145,7 @@ class ZipRecruiterScraper(Scraper): location = Location( city=job.get("job_city"), state=job.get("job_state"), country=country_enum ) - job_type = self._get_job_type_enum( + job_type = get_job_type_enum( job.get("employment_type", "").replace("_", "").lower() ) date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date() @@ -200,13 +194,17 @@ class ZipRecruiterScraper(Scraper): else "" ) description_full = job_description_clean + company_description_clean - script_tag = soup.find("script", type="application/json") - if script_tag: - job_json = json.loads(script_tag.string) - job_url_val = job_json["model"].get("saveJobURL", "") - m = re.search(r"job_url=(.+)", job_url_val) - if m: - job_url_direct = m.group(1) + + try: + script_tag = soup.find("script", type="application/json") + if script_tag: + job_json = json.loads(script_tag.string) + job_url_val = job_json["model"].get("saveJobURL", "") + m = re.search(r"job_url=(.+)", job_url_val) + if m: + job_url_direct = m.group(1) + except: + job_url_direct = None if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: description_full = markdown_converter(description_full) @@ -217,51 +215,5 @@ class ZipRecruiterScraper(Scraper): """ Sends a session event to the API with device properties. 
""" - data = [ - ("event_type", "session"), - ("logged_in", "false"), - ("number_of_retry", "1"), - ("property", "model:iPhone"), - ("property", "os:iOS"), - ("property", "locale:en_us"), - ("property", "app_build_number:4734"), - ("property", "app_version:91.0"), - ("property", "manufacturer:Apple"), - ("property", "timestamp:2025-01-12T12:04:42-06:00"), - ("property", "screen_height:852"), - ("property", "os_version:16.6.1"), - ("property", "source:install"), - ("property", "screen_width:393"), - ("property", "device_model:iPhone 14 Pro"), - ("property", "brand:Apple"), - ] - url = f"{self.api_url}/jobs-app/event" - self.session.post(url, data=data) - - @staticmethod - def _get_job_type_enum(job_type_str: str) -> list[JobType] | None: - for job_type in JobType: - if job_type_str in job_type.value: - return [job_type] - return None - - @staticmethod - def _add_params(scraper_input) -> dict[str, str | Any]: - params = { - "search": scraper_input.search_term, - "location": scraper_input.location, - } - if scraper_input.hours_old: - params["days"] = max(scraper_input.hours_old // 24, 1) - job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"} - if scraper_input.job_type: - job_type = scraper_input.job_type - params["employment_type"] = job_type_map.get(job_type, job_type.value[0]) - if scraper_input.easy_apply: - params["zipapply"] = 1 - if scraper_input.is_remote: - params["remote"] = 1 - if scraper_input.distance: - params["radius"] = scraper_input.distance - return {k: v for k, v in params.items() if v is not None} + self.session.post(url, data=get_cookie_data) diff --git a/jobspy/ziprecruiter/constant.py b/jobspy/ziprecruiter/constant.py new file mode 100644 index 0000000..2bf8371 --- /dev/null +++ b/jobspy/ziprecruiter/constant.py @@ -0,0 +1,29 @@ +headers = { + "Host": "api.ziprecruiter.com", + "accept": "*/*", + "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", + "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0", + "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006", + "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)", + "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==", + "accept-language": "en-US,en;q=0.9", +} + +get_cookie_data = [ + ("event_type", "session"), + ("logged_in", "false"), + ("number_of_retry", "1"), + ("property", "model:iPhone"), + ("property", "os:iOS"), + ("property", "locale:en_us"), + ("property", "app_build_number:4734"), + ("property", "app_version:91.0"), + ("property", "manufacturer:Apple"), + ("property", "timestamp:2025-01-12T12:04:42-06:00"), + ("property", "screen_height:852"), + ("property", "os_version:16.6.1"), + ("property", "source:install"), + ("property", "screen_width:393"), + ("property", "device_model:iPhone 14 Pro"), + ("property", "brand:Apple"), +] diff --git a/jobspy/ziprecruiter/util.py b/jobspy/ziprecruiter/util.py new file mode 100644 index 0000000..ba2f39d --- /dev/null +++ b/jobspy/ziprecruiter/util.py @@ -0,0 +1,31 @@ +from jobspy.model import JobType + + +def add_params(scraper_input) -> dict[str, str | int]: + params: dict[str, str | int] = { + "search": scraper_input.search_term, + "location": scraper_input.location, + } + if scraper_input.hours_old: + params["days"] = max(scraper_input.hours_old // 24, 1) + + job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"} + if scraper_input.job_type: + job_type = scraper_input.job_type + params["employment_type"] = job_type_map.get(job_type, 
job_type.value[0]) + + if scraper_input.easy_apply: + params["zipapply"] = 1 + if scraper_input.is_remote: + params["remote"] = 1 + if scraper_input.distance: + params["radius"] = scraper_input.distance + + return {k: v for k, v in params.items() if v is not None} + + +def get_job_type_enum(job_type_str: str) -> list[JobType] | None: + for job_type in JobType: + if job_type_str in job_type.value: + return [job_type] + return None diff --git a/pyproject.toml b/pyproject.toml index c4275a7..3fad5fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,15 +4,14 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "python-jobspy" -version = "1.1.76" -description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" -authors = [ "Zachary Hampton ", "Cullen Watson ",] -homepage = "https://github.com/Bunsly/JobSpy" +version = "1.1.77" +description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt" +authors = ["Cullen Watson ", "Zachary Hampton "] +homepage = "https://github.com/cullenwatson/JobSpy" readme = "README.md" -keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",] +keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"] [[tool.poetry.packages]] include = "jobspy" -from = "src" [tool.black] line-length = 88 @@ -29,7 +28,6 @@ markdownify = "^0.13.1" regex = "^2024.4.28" [tool.poetry.group.dev.dependencies] -pytest = "^7.4.1" jupyter = "^1.0.0" black = "*" pre-commit = "*" diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py deleted file mode 100644 index 63c00d5..0000000 --- a/src/jobspy/scrapers/__init__.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -from ..jobs import ( - Enum, - BaseModel, - JobType, - JobResponse, - Country, - DescriptionFormat, -) - - -class Site(Enum): - LINKEDIN = "linkedin" - INDEED = "indeed" - ZIP_RECRUITER = "zip_recruiter" - GLASSDOOR = "glassdoor" - GOOGLE = "google" - BAYT = "bayt" - - -class SalarySource(Enum): - DIRECT_DATA = "direct_data" - DESCRIPTION = "description" - - -class ScraperInput(BaseModel): - site_type: list[Site] - search_term: str | None = None - google_search_term: str | None = None - - location: str | None = None - country: Country | None = Country.USA - distance: int | None = None - is_remote: bool = False - job_type: JobType | None = None - easy_apply: bool | None = None - offset: int = 0 - linkedin_fetch_description: bool = False - linkedin_company_ids: list[int] | None = None - description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN - - results_wanted: int = 15 - hours_old: int | None = None - - -class Scraper(ABC): - def __init__( - self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None - ): - self.site = site - self.proxies = proxies - self.ca_cert = ca_cert - - @abstractmethod - def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... 
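[Review note] A hedged usage sketch for add_params from the new jobspy/ziprecruiter/util.py above, showing how ScraperInput fields map onto ZipRecruiter query parameters; the search values are illustrative and the expected dict assumes the defaults in this patch:

    from jobspy.model import JobType, ScraperInput, Site
    from jobspy.ziprecruiter.util import add_params

    params = add_params(
        ScraperInput(
            site_type=[Site.ZIP_RECRUITER],
            search_term="software engineer",
            location="Austin, TX",
            job_type=JobType.FULL_TIME,
            is_remote=True,
            hours_old=72,
        )
    )
    # hours_old=72 becomes days=3, FULL_TIME maps to "full_time", and None values drop out:
    # {"search": "software engineer", "location": "Austin, TX", "days": 3,
    #  "employment_type": "full_time", "remote": 1}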
diff --git a/src/jobspy/scrapers/ziprecruiter/constants.py b/src/jobspy/scrapers/ziprecruiter/constants.py deleted file mode 100644 index 7e179c9..0000000 --- a/src/jobspy/scrapers/ziprecruiter/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -headers = { - "Host": "api.ziprecruiter.com", - "accept": "*/*", - "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", - "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0", - "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006", - "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)", - "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==", - "accept-language": "en-US,en;q=0.9", -}
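[Review note] With src/ dropped and the package flattened to jobspy/, a quick end-to-end smoke test of the public API, assuming the rebuilt 1.1.77 package is installed; the search parameters are illustrative:

    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name=["indeed", "linkedin"],
        search_term="data engineer",
        location="Chicago, IL",
        results_wanted=10,
    )
    # scrape_jobs returns a pandas DataFrame whose columns follow the
    # desired_order list now defined in jobspy/util.py.
    print(jobs.head())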