diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index 366165c..6653e87 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -1,50 +1,33 @@
-name: Publish Python 🐍 distributions 📦 to PyPI
-on:
- pull_request:
- types:
- - closed
-
-permissions:
- contents: write
+name: Publish JobSpy to PyPI
+on: push
jobs:
build-n-publish:
- name: Build and publish Python 🐍 distributions 📦 to PyPI
+ name: Build and publish JobSpy to PyPI
runs-on: ubuntu-latest
- if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main'
-
steps:
- uses: actions/checkout@v3
-
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- - name: Install dependencies
- run: pip install toml
-
- - name: Increment version
- run: python increment_version.py
-
- - name: Commit version increment
- run: |
- git config --global user.name 'github-actions'
- git config --global user.email 'github-actions@github.com'
- git add pyproject.toml
- git commit -m 'Increment version'
-
- - name: Push changes
- run: git push
-
- name: Install poetry
- run: pip install poetry --user
+ run: >-
+ python3 -m
+ pip install
+ poetry
+ --user
- name: Build distribution 📦
- run: poetry build
+ run: >-
+ python3 -m
+ poetry
+ build
- name: Publish distribution 📦 to PyPI
+ if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/increment_version.py b/increment_version.py
deleted file mode 100644
index f359bd7..0000000
--- a/increment_version.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import toml
-
-def increment_version(version):
- major, minor, patch = map(int, version.split('.'))
- patch += 1
- return f"{major}.{minor}.{patch}"
-
-# Load pyproject.toml
-with open('pyproject.toml', 'r') as file:
- pyproject = toml.load(file)
-
-# Increment the version
-current_version = pyproject['tool']['poetry']['version']
-new_version = increment_version(current_version)
-pyproject['tool']['poetry']['version'] = new_version
-
-# Save the updated pyproject.toml
-with open('pyproject.toml', 'w') as file:
- toml.dump(pyproject, file)
-
-print(f"Version updated from {current_version} to {new_version}")
diff --git a/src/jobspy/__init__.py b/jobspy/__init__.py
similarity index 70%
rename from src/jobspy/__init__.py
rename to jobspy/__init__.py
index 8183338..ab57849 100644
--- a/src/jobspy/__init__.py
+++ b/jobspy/__init__.py
@@ -1,25 +1,27 @@
from __future__ import annotations
-import pandas as pd
-from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Tuple
-from .jobs import JobType, Location
-from .scrapers.utils import set_logger_level, extract_salary, create_logger
-from .scrapers.indeed import IndeedScraper
-from .scrapers.ziprecruiter import ZipRecruiterScraper
-from .scrapers.glassdoor import GlassdoorScraper
-from .scrapers.google import GoogleJobsScraper
-from .scrapers.linkedin import LinkedInScraper
-from .scrapers.bayt import BaytScraper
-from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
-from .scrapers.exceptions import (
- LinkedInException,
- IndeedException,
- ZipRecruiterException,
- GlassdoorException,
- GoogleJobsException,
+import pandas as pd
+
+from jobspy.bayt import BaytScraper
+from jobspy.glassdoor import Glassdoor
+from jobspy.google import Google
+from jobspy.indeed import Indeed
+from jobspy.linkedin import LinkedIn
+from jobspy.model import JobType, Location, JobResponse, Country
+from jobspy.model import SalarySource, ScraperInput, Site
+from jobspy.util import (
+ set_logger_level,
+ extract_salary,
+ create_logger,
+ get_enum_from_value,
+ map_str_to_site,
+ convert_to_annual,
+ desired_order,
)
+from jobspy.ziprecruiter import ZipRecruiter
def scrape_jobs(
@@ -33,7 +35,6 @@ def scrape_jobs(
easy_apply: bool | None = None,
results_wanted: int = 15,
country_indeed: str = "usa",
- hyperlinks: bool = False,
proxies: list[str] | str | None = None,
ca_cert: str | None = None,
description_format: str = "markdown",
@@ -46,28 +47,18 @@ def scrape_jobs(
**kwargs,
) -> pd.DataFrame:
"""
- Simultaneously scrapes job data from multiple job sites.
- :return: pandas dataframe containing job data
+ Scrapes job data from job boards concurrently
+ :return: Pandas DataFrame containing job data
"""
SCRAPER_MAPPING = {
- Site.LINKEDIN: LinkedInScraper,
- Site.INDEED: IndeedScraper,
- Site.ZIP_RECRUITER: ZipRecruiterScraper,
- Site.GLASSDOOR: GlassdoorScraper,
- Site.GOOGLE: GoogleJobsScraper,
+ Site.LINKEDIN: LinkedIn,
+ Site.INDEED: Indeed,
+ Site.ZIP_RECRUITER: ZipRecruiter,
+ Site.GLASSDOOR: Glassdoor,
+ Site.GOOGLE: Google,
Site.BAYT: BaytScraper,
}
set_logger_level(verbose)
-
- def map_str_to_site(site_name: str) -> Site:
- return Site[site_name.upper()]
-
- def get_enum_from_value(value_str):
- for job_type in JobType:
- if value_str in job_type.value:
- return job_type
- raise Exception(f"Invalid job type: {value_str}")
-
job_type = get_enum_from_value(job_type) if job_type else None
def get_site_type():
@@ -127,28 +118,12 @@ def scrape_jobs(
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data
- def convert_to_annual(job_data: dict):
- if job_data["interval"] == "hourly":
- job_data["min_amount"] *= 2080
- job_data["max_amount"] *= 2080
- if job_data["interval"] == "monthly":
- job_data["min_amount"] *= 12
- job_data["max_amount"] *= 12
- if job_data["interval"] == "weekly":
- job_data["min_amount"] *= 52
- job_data["max_amount"] *= 52
- if job_data["interval"] == "daily":
- job_data["min_amount"] *= 260
- job_data["max_amount"] *= 260
- job_data["interval"] = "yearly"
-
jobs_dfs: list[pd.DataFrame] = []
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
job_data = job.dict()
job_url = job_data["job_url"]
- job_data["job_url_hyper"] = f'{job_url}'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
@@ -211,38 +186,6 @@ def scrape_jobs(
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
- # Desired column order
- desired_order = [
- "id",
- "site",
- "job_url_hyper" if hyperlinks else "job_url",
- "job_url_direct",
- "title",
- "company",
- "location",
- "date_posted",
- "job_type",
- "salary_source",
- "interval",
- "min_amount",
- "max_amount",
- "currency",
- "is_remote",
- "job_level",
- "job_function",
- "listing_type",
- "emails",
- "description",
- "company_industry",
- "company_url",
- "company_logo",
- "company_url_direct",
- "company_addresses",
- "company_num_employees",
- "company_revenue",
- "company_description",
- ]
-
# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
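
With the `src/` layout flattened and the `hyperlinks` parameter dropped, a call against the new top-level package looks roughly like this (a minimal sketch; `site_name` and the selected column names are assumed from the public API and do not appear in the hunks above):

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "linkedin"],  # assumed parameter name, not shown in this diff
    search_term="software engineer",
    results_wanted=10,
    country_indeed="usa",
)
print(jobs[["site", "title", "company", "min_amount", "max_amount"]].head())
```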
diff --git a/src/jobspy/scrapers/bayt/__init__.py b/jobspy/bayt/__init__.py
similarity index 95%
rename from src/jobspy/scrapers/bayt/__init__.py
rename to jobspy/bayt/__init__.py
index 12b375e..0fd29e9 100644
--- a/src/jobspy/scrapers/bayt/__init__.py
+++ b/jobspy/bayt/__init__.py
@@ -1,10 +1,3 @@
-"""
-jobspy.scrapers.bayt
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Bayt.
-"""
-
from __future__ import annotations
import random
@@ -12,9 +5,16 @@ import time
from bs4 import BeautifulSoup
-from .. import Scraper, ScraperInput, Site
-from ..utils import create_logger, create_session
-from ...jobs import JobPost, JobResponse, Location, Country
+from jobspy.model import (
+ Scraper,
+ ScraperInput,
+ Site,
+ JobPost,
+ JobResponse,
+ Location,
+ Country,
+)
+from jobspy.util import create_logger, create_session
log = create_logger("Bayt")
diff --git a/src/jobspy/scrapers/exceptions.py b/jobspy/exception.py
similarity index 97%
rename from src/jobspy/scrapers/exceptions.py
rename to jobspy/exception.py
index ad63b06..b955a8f 100644
--- a/src/jobspy/scrapers/exceptions.py
+++ b/jobspy/exception.py
@@ -1,5 +1,5 @@
"""
-jobspy.scrapers.exceptions
+jobspy.exception
~~~~~~~~~~~~~~~~~~~
This module contains the set of Scrapers' exceptions.
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/jobspy/glassdoor/__init__.py
similarity index 83%
rename from src/jobspy/scrapers/glassdoor/__init__.py
rename to jobspy/glassdoor/__init__.py
index 0455ec2..225d7fd 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/jobspy/glassdoor/__init__.py
@@ -1,41 +1,38 @@
-"""
-jobspy.scrapers.glassdoor
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Glassdoor.
-"""
-
from __future__ import annotations
import re
import json
import requests
-from typing import Optional, Tuple
+from typing import Tuple
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
-from .constants import fallback_token, query_template, headers
-from .. import Scraper, ScraperInput, Site
-from ..utils import extract_emails_from_text, create_logger
-from ..exceptions import GlassdoorException
-from ..utils import (
+from jobspy.glassdoor.constant import fallback_token, query_template, headers
+from jobspy.glassdoor.util import (
+ get_cursor_for_page,
+ parse_compensation,
+ parse_location,
+)
+from jobspy.util import (
+ extract_emails_from_text,
+ create_logger,
create_session,
markdown_converter,
)
-from ...jobs import (
+from jobspy.exception import GlassdoorException
+from jobspy.model import (
JobPost,
- Compensation,
- CompensationInterval,
- Location,
JobResponse,
- JobType,
DescriptionFormat,
+ Scraper,
+ ScraperInput,
+ Site,
)
log = create_logger("Glassdoor")
-class GlassdoorScraper(Scraper):
+class Glassdoor(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
@@ -146,7 +143,7 @@ class GlassdoorScraper(Scraper):
except Exception as exc:
raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
- return jobs, self.get_cursor_for_page(
+ return jobs, get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
)
@@ -185,9 +182,9 @@ class GlassdoorScraper(Scraper):
if location_type == "S":
is_remote = True
else:
- location = self.parse_location(location_name)
+ location = parse_location(location_name)
- compensation = self.parse_compensation(job["header"])
+ compensation = parse_compensation(job["header"])
try:
description = self._fetch_job_description(job_id)
except:
@@ -321,44 +318,3 @@ class GlassdoorScraper(Scraper):
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
)
return json.dumps([payload])
-
- @staticmethod
- def parse_compensation(data: dict) -> Optional[Compensation]:
- pay_period = data.get("payPeriod")
- adjusted_pay = data.get("payPeriodAdjustedPay")
- currency = data.get("payCurrency", "USD")
- if not pay_period or not adjusted_pay:
- return None
-
- interval = None
- if pay_period == "ANNUAL":
- interval = CompensationInterval.YEARLY
- elif pay_period:
- interval = CompensationInterval.get_interval(pay_period)
- min_amount = int(adjusted_pay.get("p10") // 1)
- max_amount = int(adjusted_pay.get("p90") // 1)
- return Compensation(
- interval=interval,
- min_amount=min_amount,
- max_amount=max_amount,
- currency=currency,
- )
-
- @staticmethod
- def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
- for job_type in JobType:
- if job_type_str in job_type.value:
- return [job_type]
-
- @staticmethod
- def parse_location(location_name: str) -> Location | None:
- if not location_name or location_name == "Remote":
- return
- city, _, state = location_name.partition(", ")
- return Location(city=city, state=state)
-
- @staticmethod
- def get_cursor_for_page(pagination_cursors, page_num):
- for cursor_data in pagination_cursors:
- if cursor_data["pageNumber"] == page_num:
- return cursor_data["cursor"]
diff --git a/src/jobspy/scrapers/glassdoor/constants.py b/jobspy/glassdoor/constant.py
similarity index 100%
rename from src/jobspy/scrapers/glassdoor/constants.py
rename to jobspy/glassdoor/constant.py
diff --git a/jobspy/glassdoor/util.py b/jobspy/glassdoor/util.py
new file mode 100644
index 0000000..c52664e
--- /dev/null
+++ b/jobspy/glassdoor/util.py
@@ -0,0 +1,42 @@
+from jobspy.model import Compensation, CompensationInterval, Location, JobType
+
+
+def parse_compensation(data: dict) -> Compensation | None:
+ pay_period = data.get("payPeriod")
+ adjusted_pay = data.get("payPeriodAdjustedPay")
+ currency = data.get("payCurrency", "USD")
+ if not pay_period or not adjusted_pay:
+ return None
+
+ interval = None
+ if pay_period == "ANNUAL":
+ interval = CompensationInterval.YEARLY
+ elif pay_period:
+ interval = CompensationInterval.get_interval(pay_period)
+ min_amount = int(adjusted_pay.get("p10") // 1)
+ max_amount = int(adjusted_pay.get("p90") // 1)
+ return Compensation(
+ interval=interval,
+ min_amount=min_amount,
+ max_amount=max_amount,
+ currency=currency,
+ )
+
+
+def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+ for job_type in JobType:
+ if job_type_str in job_type.value:
+ return [job_type]
+
+
+def parse_location(location_name: str) -> Location | None:
+ if not location_name or location_name == "Remote":
+ return
+ city, _, state = location_name.partition(", ")
+ return Location(city=city, state=state)
+
+
+def get_cursor_for_page(pagination_cursors, page_num):
+ for cursor_data in pagination_cursors:
+ if cursor_data["pageNumber"] == page_num:
+ return cursor_data["cursor"]
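
A quick sketch of the extracted Glassdoor helpers in isolation (the input values are illustrative):

```python
from jobspy.glassdoor.util import parse_location, get_cursor_for_page

parse_location("Austin, TX")   # Location(city="Austin", state="TX")
parse_location("Remote")       # None (remote listings carry no Location)

cursors = [{"pageNumber": 2, "cursor": "abc123"}]
get_cursor_for_page(cursors, 2)  # "abc123"
get_cursor_for_page(cursors, 3)  # None (no cursor recorded for that page)
```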
diff --git a/src/jobspy/scrapers/google/__init__.py b/jobspy/google/__init__.py
similarity index 78%
rename from src/jobspy/scrapers/google/__init__.py
rename to jobspy/google/__init__.py
index 0c47278..e77903c 100644
--- a/src/jobspy/scrapers/google/__init__.py
+++ b/jobspy/google/__init__.py
@@ -1,10 +1,3 @@
-"""
-jobspy.scrapers.google
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Google.
-"""
-
from __future__ import annotations
import math
@@ -13,23 +6,21 @@ import json
from typing import Tuple
from datetime import datetime, timedelta
-from .constants import headers_jobs, headers_initial, async_param
-from .. import Scraper, ScraperInput, Site
-from ..utils import extract_emails_from_text, create_logger, extract_job_type
-from ..utils import (
- create_session,
-)
-from ...jobs import (
+from jobspy.google.constant import headers_jobs, headers_initial, async_param
+from jobspy.model import (
+ Scraper,
+ ScraperInput,
+ Site,
JobPost,
JobResponse,
Location,
JobType,
)
-
-log = create_logger("Google")
+from jobspy.util import extract_emails_from_text, extract_job_type, create_session
+from jobspy.google.util import log, find_job_info_initial_page, find_job_info
-class GoogleJobsScraper(Scraper):
+class Google(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
@@ -135,7 +126,7 @@ class GoogleJobsScraper(Scraper):
pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
match_fc = re.search(pattern_fc, response.text)
data_async_fc = match_fc.group(1) if match_fc else None
- jobs_raw = self._find_job_info_initial_page(response.text)
+ jobs_raw = find_job_info_initial_page(response.text)
jobs = []
for job_raw in jobs_raw:
job_post = self._parse_job(job_raw)
@@ -167,7 +158,7 @@ class GoogleJobsScraper(Scraper):
continue
job_d = json.loads(job_data)
- job_info = self._find_job_info(job_d)
+ job_info = find_job_info(job_d)
job_post = self._parse_job(job_info)
if job_post:
jobs_on_page.append(job_post)
@@ -209,39 +200,3 @@ class GoogleJobsScraper(Scraper):
job_type=extract_job_type(description),
)
return job_post
-
- @staticmethod
- def _find_job_info(jobs_data: list | dict) -> list | None:
- """Iterates through the JSON data to find the job listings"""
- if isinstance(jobs_data, dict):
- for key, value in jobs_data.items():
- if key == "520084652" and isinstance(value, list):
- return value
- else:
- result = GoogleJobsScraper._find_job_info(value)
- if result:
- return result
- elif isinstance(jobs_data, list):
- for item in jobs_data:
- result = GoogleJobsScraper._find_job_info(item)
- if result:
- return result
- return None
-
- @staticmethod
- def _find_job_info_initial_page(html_text: str):
- pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
- results = []
- matches = re.finditer(pattern, html_text)
-
- import json
-
- for match in matches:
- try:
- parsed_data = json.loads(match.group(1))
- results.append(parsed_data)
-
- except json.JSONDecodeError as e:
- log.error(f"Failed to parse match: {str(e)}")
- results.append({"raw_match": match.group(0), "error": str(e)})
- return results
diff --git a/src/jobspy/scrapers/google/constants.py b/jobspy/google/constant.py
similarity index 100%
rename from src/jobspy/scrapers/google/constants.py
rename to jobspy/google/constant.py
diff --git a/jobspy/google/util.py b/jobspy/google/util.py
new file mode 100644
index 0000000..89c059b
--- /dev/null
+++ b/jobspy/google/util.py
@@ -0,0 +1,41 @@
+import re
+
+from jobspy.util import create_logger
+
+log = create_logger("Google")
+
+
+def find_job_info(jobs_data: list | dict) -> list | None:
+ """Iterates through the JSON data to find the job listings"""
+ if isinstance(jobs_data, dict):
+ for key, value in jobs_data.items():
+ if key == "520084652" and isinstance(value, list):
+ return value
+ else:
+ result = find_job_info(value)
+ if result:
+ return result
+ elif isinstance(jobs_data, list):
+ for item in jobs_data:
+ result = find_job_info(item)
+ if result:
+ return result
+ return None
+
+
+def find_job_info_initial_page(html_text: str):
+ pattern = '520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
+ results = []
+ matches = re.finditer(pattern, html_text)
+
+ import json
+
+ for match in matches:
+ try:
+ parsed_data = json.loads(match.group(1))
+ results.append(parsed_data)
+
+ except json.JSONDecodeError as e:
+ log.error(f"Failed to parse match: {str(e)}")
+ results.append({"raw_match": match.group(0), "error": str(e)})
+ return results
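
`find_job_info` walks arbitrarily nested JSON until it hits the hard-coded `"520084652"` key; a toy payload (structure illustrative) shows the recursion:

```python
from jobspy.google.util import find_job_info

payload = {"outer": [{"520084652": [["title", "company", "location"]]}]}
find_job_info(payload)            # [["title", "company", "location"]]
find_job_info({"other": "data"})  # None (target key absent)
```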
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/jobspy/indeed/__init__.py
similarity index 70%
rename from src/jobspy/scrapers/indeed/__init__.py
rename to jobspy/indeed/__init__.py
index b9235ae..adbc9e9 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/jobspy/indeed/__init__.py
@@ -1,39 +1,32 @@
-"""
-jobspy.scrapers.indeed
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Indeed.
-"""
-
from __future__ import annotations
import math
-from typing import Tuple
from datetime import datetime
+from typing import Tuple
-from .constants import job_search_query, api_headers
-from .. import Scraper, ScraperInput, Site
-from ..utils import (
- extract_emails_from_text,
- get_enum_from_job_type,
- markdown_converter,
- create_session,
- create_logger,
-)
-from ...jobs import (
+from jobspy.indeed.constant import job_search_query, api_headers
+from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type
+from jobspy.model import (
+ Scraper,
+ ScraperInput,
+ Site,
JobPost,
- Compensation,
- CompensationInterval,
Location,
JobResponse,
JobType,
DescriptionFormat,
)
+from jobspy.util import (
+ extract_emails_from_text,
+ markdown_converter,
+ create_session,
+ create_logger,
+)
log = create_logger("Indeed")
-class IndeedScraper(Scraper):
+class Indeed(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
@@ -213,7 +206,7 @@ class IndeedScraper(Scraper):
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
- job_type = self._get_job_type(job["attributes"])
+ job_type = get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
employer = job["employer"].get("dossier") if job["employer"] else None
@@ -234,14 +227,14 @@ class IndeedScraper(Scraper):
country=job.get("location", {}).get("countryCode"),
),
job_type=job_type,
- compensation=self._get_compensation(job["compensation"]),
+ compensation=get_compensation(job["compensation"]),
date_posted=date_posted,
job_url=job_url,
job_url_direct=(
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
),
emails=extract_emails_from_text(description) if description else None,
- is_remote=self._is_job_remote(job, description),
+ is_remote=is_job_remote(job, description),
company_addresses=(
employer_details["addresses"][0]
if employer_details.get("addresses")
@@ -265,86 +258,3 @@ class IndeedScraper(Scraper):
else None
),
)
-
- @staticmethod
- def _get_job_type(attributes: list) -> list[JobType]:
- """
- Parses the attributes to get list of job types
- :param attributes:
- :return: list of JobType
- """
- job_types: list[JobType] = []
- for attribute in attributes:
- job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
- job_type = get_enum_from_job_type(job_type_str)
- if job_type:
- job_types.append(job_type)
- return job_types
-
- @staticmethod
- def _get_compensation(compensation: dict) -> Compensation | None:
- """
- Parses the job to get compensation
- :param job:
- :return: compensation object
- """
- if not compensation["baseSalary"] and not compensation["estimated"]:
- return None
- comp = (
- compensation["baseSalary"]
- if compensation["baseSalary"]
- else compensation["estimated"]["baseSalary"]
- )
- if not comp:
- return None
- interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
- if not interval:
- return None
- min_range = comp["range"].get("min")
- max_range = comp["range"].get("max")
- return Compensation(
- interval=interval,
- min_amount=int(min_range) if min_range is not None else None,
- max_amount=int(max_range) if max_range is not None else None,
- currency=(
- compensation["estimated"]["currencyCode"]
- if compensation["estimated"]
- else compensation["currencyCode"]
- ),
- )
-
- @staticmethod
- def _is_job_remote(job: dict, description: str) -> bool:
- """
- Searches the description, location, and attributes to check if job is remote
- """
- remote_keywords = ["remote", "work from home", "wfh"]
- is_remote_in_attributes = any(
- any(keyword in attr["label"].lower() for keyword in remote_keywords)
- for attr in job["attributes"]
- )
- is_remote_in_description = any(
- keyword in description.lower() for keyword in remote_keywords
- )
- is_remote_in_location = any(
- keyword in job["location"]["formatted"]["long"].lower()
- for keyword in remote_keywords
- )
- return (
- is_remote_in_attributes or is_remote_in_description or is_remote_in_location
- )
-
- @staticmethod
- def _get_compensation_interval(interval: str) -> CompensationInterval:
- interval_mapping = {
- "DAY": "DAILY",
- "YEAR": "YEARLY",
- "HOUR": "HOURLY",
- "WEEK": "WEEKLY",
- "MONTH": "MONTHLY",
- }
- mapped_interval = interval_mapping.get(interval.upper(), None)
- if mapped_interval and mapped_interval in CompensationInterval.__members__:
- return CompensationInterval[mapped_interval]
- else:
- raise ValueError(f"Unsupported interval: {interval}")
diff --git a/src/jobspy/scrapers/indeed/constants.py b/jobspy/indeed/constant.py
similarity index 100%
rename from src/jobspy/scrapers/indeed/constants.py
rename to jobspy/indeed/constant.py
diff --git a/jobspy/indeed/util.py b/jobspy/indeed/util.py
new file mode 100644
index 0000000..5515ed1
--- /dev/null
+++ b/jobspy/indeed/util.py
@@ -0,0 +1,83 @@
+from jobspy.model import CompensationInterval, JobType, Compensation
+from jobspy.util import get_enum_from_job_type
+
+
+def get_job_type(attributes: list) -> list[JobType]:
+ """
+ Parses the attributes to get list of job types
+ :param attributes:
+ :return: list of JobType
+ """
+ job_types: list[JobType] = []
+ for attribute in attributes:
+ job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
+ job_type = get_enum_from_job_type(job_type_str)
+ if job_type:
+ job_types.append(job_type)
+ return job_types
+
+
+def get_compensation(compensation: dict) -> Compensation | None:
+ """
+ Parses the job to get compensation
+ :param compensation:
+ :return: compensation object
+ """
+ if not compensation["baseSalary"] and not compensation["estimated"]:
+ return None
+ comp = (
+ compensation["baseSalary"]
+ if compensation["baseSalary"]
+ else compensation["estimated"]["baseSalary"]
+ )
+ if not comp:
+ return None
+ interval = get_compensation_interval(comp["unitOfWork"])
+ if not interval:
+ return None
+ min_range = comp["range"].get("min")
+ max_range = comp["range"].get("max")
+ return Compensation(
+ interval=interval,
+ min_amount=int(min_range) if min_range is not None else None,
+ max_amount=int(max_range) if max_range is not None else None,
+ currency=(
+ compensation["estimated"]["currencyCode"]
+ if compensation["estimated"]
+ else compensation["currencyCode"]
+ ),
+ )
+
+
+def is_job_remote(job: dict, description: str) -> bool:
+ """
+ Searches the description, location, and attributes to check if job is remote
+ """
+ remote_keywords = ["remote", "work from home", "wfh"]
+ is_remote_in_attributes = any(
+ any(keyword in attr["label"].lower() for keyword in remote_keywords)
+ for attr in job["attributes"]
+ )
+ is_remote_in_description = any(
+ keyword in description.lower() for keyword in remote_keywords
+ )
+ is_remote_in_location = any(
+ keyword in job["location"]["formatted"]["long"].lower()
+ for keyword in remote_keywords
+ )
+ return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+
+
+def get_compensation_interval(interval: str) -> CompensationInterval:
+ interval_mapping = {
+ "DAY": "DAILY",
+ "YEAR": "YEARLY",
+ "HOUR": "HOURLY",
+ "WEEK": "WEEKLY",
+ "MONTH": "MONTHLY",
+ }
+ mapped_interval = interval_mapping.get(interval.upper(), None)
+ if mapped_interval and mapped_interval in CompensationInterval.__members__:
+ return CompensationInterval[mapped_interval]
+ else:
+ raise ValueError(f"Unsupported interval: {interval}")
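
A sketch of the now-module-level Indeed helpers (the attribute label and the `JobType` membership check are assumptions about the enum's values):

```python
from jobspy.indeed.util import get_compensation_interval, get_job_type

get_compensation_interval("YEAR")  # CompensationInterval.YEARLY
get_compensation_interval("hour")  # CompensationInterval.HOURLY (lookup upper-cases first)

# "Full-time" normalizes to "fulltime" before the enum lookup
get_job_type([{"label": "Full-time"}])  # [JobType.FULL_TIME], assuming "fulltime" is in JobType.FULL_TIME.value
```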
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/jobspy/linkedin/__init__.py
similarity index 77%
rename from src/jobspy/scrapers/linkedin/__init__.py
rename to jobspy/linkedin/__init__.py
index d854cbe..db20f12 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/jobspy/linkedin/__init__.py
@@ -1,47 +1,48 @@
-"""
-jobspy.scrapers.linkedin
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape LinkedIn.
-"""
-
from __future__ import annotations
import math
-import time
import random
-import regex as re
-from typing import Optional
+import time
from datetime import datetime
-
-from bs4.element import Tag
-from bs4 import BeautifulSoup
+from typing import Optional
from urllib.parse import urlparse, urlunparse, unquote
-from .constants import headers
-from .. import Scraper, ScraperInput, Site
-from ..exceptions import LinkedInException
-from ..utils import create_session, remove_attributes, create_logger
-from ...jobs import (
+import regex as re
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+
+from jobspy.exception import LinkedInException
+from jobspy.linkedin.constant import headers
+from jobspy.linkedin.util import (
+ job_type_code,
+ parse_job_type,
+ parse_job_level,
+ parse_company_industry,
+)
+from jobspy.model import (
JobPost,
Location,
JobResponse,
- JobType,
Country,
Compensation,
DescriptionFormat,
+ Scraper,
+ ScraperInput,
+ Site,
)
-from ..utils import (
+from jobspy.util import (
extract_emails_from_text,
- get_enum_from_job_type,
currency_parser,
markdown_converter,
+ create_session,
+ remove_attributes,
+ create_logger,
)
log = create_logger("LinkedIn")
-class LinkedInScraper(Scraper):
+class LinkedIn(Scraper):
base_url = "https://www.linkedin.com"
delay = 3
band_delay = 4
@@ -95,7 +96,7 @@ class LinkedInScraper(Scraper):
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"f_JT": (
- self.job_type_code(scraper_input.job_type)
+ job_type_code(scraper_input.job_type)
if scraper_input.job_type
else None
),
@@ -282,9 +283,9 @@ class LinkedInScraper(Scraper):
)
return {
"description": description,
- "job_level": self._parse_job_level(soup),
- "company_industry": self._parse_company_industry(soup),
- "job_type": self._parse_job_type(soup),
+ "job_level": parse_job_level(soup),
+ "company_industry": parse_company_industry(soup),
+ "job_type": parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"company_logo": company_logo,
"job_function": job_function,
@@ -316,77 +317,6 @@ class LinkedInScraper(Scraper):
location = Location(city=city, state=state, country=country)
return location
- @staticmethod
- def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
- """
- Gets the job type from job page
- :param soup_job_type:
- :return: JobType
- """
- h3_tag = soup_job_type.find(
- "h3",
- class_="description__job-criteria-subheader",
- string=lambda text: "Employment type" in text,
- )
- employment_type = None
- if h3_tag:
- employment_type_span = h3_tag.find_next_sibling(
- "span",
- class_="description__job-criteria-text description__job-criteria-text--criteria",
- )
- if employment_type_span:
- employment_type = employment_type_span.get_text(strip=True)
- employment_type = employment_type.lower()
- employment_type = employment_type.replace("-", "")
-
- return [get_enum_from_job_type(employment_type)] if employment_type else []
-
- @staticmethod
- def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
- """
- Gets the job level from job page
- :param soup_job_level:
- :return: str
- """
- h3_tag = soup_job_level.find(
- "h3",
- class_="description__job-criteria-subheader",
- string=lambda text: "Seniority level" in text,
- )
- job_level = None
- if h3_tag:
- job_level_span = h3_tag.find_next_sibling(
- "span",
- class_="description__job-criteria-text description__job-criteria-text--criteria",
- )
- if job_level_span:
- job_level = job_level_span.get_text(strip=True)
-
- return job_level
-
- @staticmethod
- def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
- """
- Gets the company industry from job page
- :param soup_industry:
- :return: str
- """
- h3_tag = soup_industry.find(
- "h3",
- class_="description__job-criteria-subheader",
- string=lambda text: "Industries" in text,
- )
- industry = None
- if h3_tag:
- industry_span = h3_tag.find_next_sibling(
- "span",
- class_="description__job-criteria-text description__job-criteria-text--criteria",
- )
- if industry_span:
- industry = industry_span.get_text(strip=True)
-
- return industry
-
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
@@ -403,13 +333,3 @@ class LinkedInScraper(Scraper):
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
-
- @staticmethod
- def job_type_code(job_type_enum: JobType) -> str:
- return {
- JobType.FULL_TIME: "F",
- JobType.PART_TIME: "P",
- JobType.INTERNSHIP: "I",
- JobType.CONTRACT: "C",
- JobType.TEMPORARY: "T",
- }.get(job_type_enum, "")
diff --git a/src/jobspy/scrapers/linkedin/constants.py b/jobspy/linkedin/constant.py
similarity index 100%
rename from src/jobspy/scrapers/linkedin/constants.py
rename to jobspy/linkedin/constant.py
diff --git a/jobspy/linkedin/util.py b/jobspy/linkedin/util.py
new file mode 100644
index 0000000..fe37c48
--- /dev/null
+++ b/jobspy/linkedin/util.py
@@ -0,0 +1,85 @@
+from bs4 import BeautifulSoup
+
+from jobspy.model import JobType
+from jobspy.util import get_enum_from_job_type
+
+
+def job_type_code(job_type_enum: JobType) -> str:
+ return {
+ JobType.FULL_TIME: "F",
+ JobType.PART_TIME: "P",
+ JobType.INTERNSHIP: "I",
+ JobType.CONTRACT: "C",
+ JobType.TEMPORARY: "T",
+ }.get(job_type_enum, "")
+
+
+def parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
+ """
+ Gets the job type from job page
+ :param soup_job_type:
+ :return: JobType
+ """
+ h3_tag = soup_job_type.find(
+ "h3",
+ class_="description__job-criteria-subheader",
+ string=lambda text: "Employment type" in text,
+ )
+ employment_type = None
+ if h3_tag:
+ employment_type_span = h3_tag.find_next_sibling(
+ "span",
+ class_="description__job-criteria-text description__job-criteria-text--criteria",
+ )
+ if employment_type_span:
+ employment_type = employment_type_span.get_text(strip=True)
+ employment_type = employment_type.lower()
+ employment_type = employment_type.replace("-", "")
+
+ return [get_enum_from_job_type(employment_type)] if employment_type else []
+
+
+def parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
+ """
+ Gets the job level from job page
+ :param soup_job_level:
+ :return: str
+ """
+ h3_tag = soup_job_level.find(
+ "h3",
+ class_="description__job-criteria-subheader",
+ string=lambda text: "Seniority level" in text,
+ )
+ job_level = None
+ if h3_tag:
+ job_level_span = h3_tag.find_next_sibling(
+ "span",
+ class_="description__job-criteria-text description__job-criteria-text--criteria",
+ )
+ if job_level_span:
+ job_level = job_level_span.get_text(strip=True)
+
+ return job_level
+
+
+def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
+ """
+ Gets the company industry from job page
+ :param soup_industry:
+ :return: str
+ """
+ h3_tag = soup_industry.find(
+ "h3",
+ class_="description__job-criteria-subheader",
+ string=lambda text: "Industries" in text,
+ )
+ industry = None
+ if h3_tag:
+ industry_span = h3_tag.find_next_sibling(
+ "span",
+ class_="description__job-criteria-text description__job-criteria-text--criteria",
+ )
+ if industry_span:
+ industry = industry_span.get_text(strip=True)
+
+ return industry
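
`job_type_code` backs the `f_JT` query parameter built in `LinkedIn.scrape` above:

```python
from jobspy.model import JobType
from jobspy.linkedin.util import job_type_code

job_type_code(JobType.FULL_TIME)  # "F"
job_type_code(JobType.CONTRACT)   # "C"
```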
diff --git a/src/jobspy/jobs/__init__.py b/jobspy/model.py
similarity index 87%
rename from src/jobspy/jobs/__init__.py
rename to jobspy/model.py
index c51839c..b11e163 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/jobspy/model.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+from abc import ABC, abstractmethod
from typing import Optional
from datetime import date
from enum import Enum
@@ -265,3 +266,49 @@ class JobPost(BaseModel):
class JobResponse(BaseModel):
jobs: list[JobPost] = []
+
+
+class Site(Enum):
+ LINKEDIN = "linkedin"
+ INDEED = "indeed"
+ ZIP_RECRUITER = "zip_recruiter"
+ GLASSDOOR = "glassdoor"
+ GOOGLE = "google"
+ BAYT = "bayt"
+
+
+class SalarySource(Enum):
+ DIRECT_DATA = "direct_data"
+ DESCRIPTION = "description"
+
+
+class ScraperInput(BaseModel):
+ site_type: list[Site]
+ search_term: str | None = None
+ google_search_term: str | None = None
+
+ location: str | None = None
+ country: Country | None = Country.USA
+ distance: int | None = None
+ is_remote: bool = False
+ job_type: JobType | None = None
+ easy_apply: bool | None = None
+ offset: int = 0
+ linkedin_fetch_description: bool = False
+ linkedin_company_ids: list[int] | None = None
+ description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
+
+ results_wanted: int = 15
+ hours_old: int | None = None
+
+
+class Scraper(ABC):
+ def __init__(
+ self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
+ ):
+ self.site = site
+ self.proxies = proxies
+ self.ca_cert = ca_cert
+
+ @abstractmethod
+ def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
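
The relocated `Scraper` ABC is what each board class above (`Indeed`, `LinkedIn`, ...) now subclasses; a minimal conforming sketch (the class itself is hypothetical):

```python
from jobspy.model import JobResponse, Scraper, ScraperInput, Site

class DemoScraper(Scraper):  # illustration only, not part of the package
    def __init__(self, proxies: list[str] | None = None, ca_cert: str | None = None):
        super().__init__(Site.GOOGLE, proxies=proxies, ca_cert=ca_cert)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        return JobResponse(jobs=[])  # a real scraper would fetch and parse here
```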
diff --git a/src/jobspy/scrapers/utils.py b/jobspy/util.py
similarity index 86%
rename from src/jobspy/scrapers/utils.py
rename to jobspy/util.py
index 32e0663..36c13e7 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/jobspy/util.py
@@ -11,7 +11,7 @@ import urllib3
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
-from ..jobs import CompensationInterval, JobType
+from jobspy.model import CompensationInterval, JobType, Site
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -286,3 +286,62 @@ def extract_job_type(description: str):
listing_types.append(key)
return listing_types if listing_types else None
+
+
+def map_str_to_site(site_name: str) -> Site:
+ return Site[site_name.upper()]
+
+
+def get_enum_from_value(value_str):
+ for job_type in JobType:
+ if value_str in job_type.value:
+ return job_type
+ raise Exception(f"Invalid job type: {value_str}")
+
+
+def convert_to_annual(job_data: dict):
+ if job_data["interval"] == "hourly":
+ job_data["min_amount"] *= 2080
+ job_data["max_amount"] *= 2080
+ if job_data["interval"] == "monthly":
+ job_data["min_amount"] *= 12
+ job_data["max_amount"] *= 12
+ if job_data["interval"] == "weekly":
+ job_data["min_amount"] *= 52
+ job_data["max_amount"] *= 52
+ if job_data["interval"] == "daily":
+ job_data["min_amount"] *= 260
+ job_data["max_amount"] *= 260
+ job_data["interval"] = "yearly"
+
+
+desired_order = [
+ "id",
+ "site",
+ "job_url",
+ "job_url_direct",
+ "title",
+ "company",
+ "location",
+ "date_posted",
+ "job_type",
+ "salary_source",
+ "interval",
+ "min_amount",
+ "max_amount",
+ "currency",
+ "is_remote",
+ "job_level",
+ "job_function",
+ "listing_type",
+ "emails",
+ "description",
+ "company_industry",
+ "company_url",
+ "company_logo",
+ "company_url_direct",
+ "company_addresses",
+ "company_num_employees",
+ "company_revenue",
+ "company_description",
+]
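
`convert_to_annual` mutates the row in place, using 2,080 working hours, 52 weeks, 260 weekdays, or 12 months per year:

```python
from jobspy.util import convert_to_annual

row = {"interval": "hourly", "min_amount": 50, "max_amount": 70}
convert_to_annual(row)
row  # {"interval": "yearly", "min_amount": 104000, "max_amount": 145600}
```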
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/jobspy/ziprecruiter/__init__.py
similarity index 73%
rename from src/jobspy/scrapers/ziprecruiter/__init__.py
rename to jobspy/ziprecruiter/__init__.py
index 816331e..c91ba57 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/jobspy/ziprecruiter/__init__.py
@@ -1,10 +1,3 @@
-"""
-jobspy.scrapers.ziprecruiter
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape ZipRecruiter.
-"""
-
from __future__ import annotations
import json
@@ -13,33 +6,34 @@ import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
-from typing import Optional, Tuple, Any
from bs4 import BeautifulSoup
-from .constants import headers
-from .. import Scraper, ScraperInput, Site
-from ..utils import (
+from jobspy.ziprecruiter.constant import headers, get_cookie_data
+from jobspy.util import (
extract_emails_from_text,
create_session,
markdown_converter,
remove_attributes,
create_logger,
)
-from ...jobs import (
+from jobspy.model import (
JobPost,
Compensation,
Location,
JobResponse,
- JobType,
Country,
DescriptionFormat,
+ Scraper,
+ ScraperInput,
+ Site,
)
+from jobspy.ziprecruiter.util import get_job_type_enum, add_params
log = create_logger("ZipRecruiter")
-class ZipRecruiterScraper(Scraper):
+class ZipRecruiter(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
@@ -90,7 +84,7 @@ class ZipRecruiterScraper(Scraper):
def _find_jobs_in_page(
self, scraper_input: ScraperInput, continue_token: str | None = None
- ) -> Tuple[list[JobPost], Optional[str]]:
+ ) -> tuple[list[JobPost], str | None]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
@@ -98,7 +92,7 @@ class ZipRecruiterScraper(Scraper):
:return: jobs found on page
"""
jobs_list = []
- params = self._add_params(scraper_input)
+ params = add_params(scraper_input)
if continue_token:
params["continue_from"] = continue_token
try:
@@ -151,7 +145,7 @@ class ZipRecruiterScraper(Scraper):
location = Location(
city=job.get("job_city"), state=job.get("job_state"), country=country_enum
)
- job_type = self._get_job_type_enum(
+ job_type = get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower()
)
date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
@@ -200,13 +194,17 @@ class ZipRecruiterScraper(Scraper):
else ""
)
description_full = job_description_clean + company_description_clean
- script_tag = soup.find("script", type="application/json")
- if script_tag:
- job_json = json.loads(script_tag.string)
- job_url_val = job_json["model"].get("saveJobURL", "")
- m = re.search(r"job_url=(.+)", job_url_val)
- if m:
- job_url_direct = m.group(1)
+
+ try:
+ script_tag = soup.find("script", type="application/json")
+ if script_tag:
+ job_json = json.loads(script_tag.string)
+ job_url_val = job_json["model"].get("saveJobURL", "")
+ m = re.search(r"job_url=(.+)", job_url_val)
+ if m:
+ job_url_direct = m.group(1)
+ except Exception:
+ job_url_direct = None
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description_full = markdown_converter(description_full)
@@ -217,51 +215,5 @@ class ZipRecruiterScraper(Scraper):
"""
Sends a session event to the API with device properties.
"""
- data = [
- ("event_type", "session"),
- ("logged_in", "false"),
- ("number_of_retry", "1"),
- ("property", "model:iPhone"),
- ("property", "os:iOS"),
- ("property", "locale:en_us"),
- ("property", "app_build_number:4734"),
- ("property", "app_version:91.0"),
- ("property", "manufacturer:Apple"),
- ("property", "timestamp:2025-01-12T12:04:42-06:00"),
- ("property", "screen_height:852"),
- ("property", "os_version:16.6.1"),
- ("property", "source:install"),
- ("property", "screen_width:393"),
- ("property", "device_model:iPhone 14 Pro"),
- ("property", "brand:Apple"),
- ]
-
url = f"{self.api_url}/jobs-app/event"
- self.session.post(url, data=data)
-
- @staticmethod
- def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
- for job_type in JobType:
- if job_type_str in job_type.value:
- return [job_type]
- return None
-
- @staticmethod
- def _add_params(scraper_input) -> dict[str, str | Any]:
- params = {
- "search": scraper_input.search_term,
- "location": scraper_input.location,
- }
- if scraper_input.hours_old:
- params["days"] = max(scraper_input.hours_old // 24, 1)
- job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
- if scraper_input.job_type:
- job_type = scraper_input.job_type
- params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
- if scraper_input.easy_apply:
- params["zipapply"] = 1
- if scraper_input.is_remote:
- params["remote"] = 1
- if scraper_input.distance:
- params["radius"] = scraper_input.distance
- return {k: v for k, v in params.items() if v is not None}
+ self.session.post(url, data=get_cookie_data)
diff --git a/jobspy/ziprecruiter/constant.py b/jobspy/ziprecruiter/constant.py
new file mode 100644
index 0000000..2bf8371
--- /dev/null
+++ b/jobspy/ziprecruiter/constant.py
@@ -0,0 +1,29 @@
+headers = {
+ "Host": "api.ziprecruiter.com",
+ "accept": "*/*",
+ "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
+ "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
+ "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
+ "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
+ "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
+ "accept-language": "en-US,en;q=0.9",
+}
+
+get_cookie_data = [
+ ("event_type", "session"),
+ ("logged_in", "false"),
+ ("number_of_retry", "1"),
+ ("property", "model:iPhone"),
+ ("property", "os:iOS"),
+ ("property", "locale:en_us"),
+ ("property", "app_build_number:4734"),
+ ("property", "app_version:91.0"),
+ ("property", "manufacturer:Apple"),
+ ("property", "timestamp:2025-01-12T12:04:42-06:00"),
+ ("property", "screen_height:852"),
+ ("property", "os_version:16.6.1"),
+ ("property", "source:install"),
+ ("property", "screen_width:393"),
+ ("property", "device_model:iPhone 14 Pro"),
+ ("property", "brand:Apple"),
+]
diff --git a/jobspy/ziprecruiter/util.py b/jobspy/ziprecruiter/util.py
new file mode 100644
index 0000000..ba2f39d
--- /dev/null
+++ b/jobspy/ziprecruiter/util.py
@@ -0,0 +1,31 @@
+from jobspy.model import JobType
+
+
+def add_params(scraper_input) -> dict[str, str | int]:
+ params: dict[str, str | int] = {
+ "search": scraper_input.search_term,
+ "location": scraper_input.location,
+ }
+ if scraper_input.hours_old:
+ params["days"] = max(scraper_input.hours_old // 24, 1)
+
+ job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
+ if scraper_input.job_type:
+ job_type = scraper_input.job_type
+ params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
+
+ if scraper_input.easy_apply:
+ params["zipapply"] = 1
+ if scraper_input.is_remote:
+ params["remote"] = 1
+ if scraper_input.distance:
+ params["radius"] = scraper_input.distance
+
+ return {k: v for k, v in params.items() if v is not None}
+
+
+def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+ for job_type in JobType:
+ if job_type_str in job_type.value:
+ return [job_type]
+ return None
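
`add_params` only reads a handful of `ScraperInput` fields, so a bare stand-in object is enough to sketch it (the stand-in class is illustrative):

```python
from jobspy.ziprecruiter.util import add_params

class _Input:  # minimal stand-in for ScraperInput, illustration only
    search_term = "data analyst"
    location = "Chicago, IL"
    hours_old = 72
    job_type = None
    easy_apply = None
    is_remote = True
    distance = 25

add_params(_Input())
# {'search': 'data analyst', 'location': 'Chicago, IL', 'days': 3, 'remote': 1, 'radius': 25}
```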
diff --git a/pyproject.toml b/pyproject.toml
index c4275a7..3fad5fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,15 +4,14 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
-version = "1.1.76"
-description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
-authors = [ "Zachary Hampton ", "Cullen Watson ",]
-homepage = "https://github.com/Bunsly/JobSpy"
+version = "1.1.77"
+description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
+authors = ["Cullen Watson ", "Zachary Hampton "]
+homepage = "https://github.com/cullenwatson/JobSpy"
readme = "README.md"
-keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
+keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"]
[[tool.poetry.packages]]
include = "jobspy"
-from = "src"
[tool.black]
line-length = 88
@@ -29,7 +28,6 @@ markdownify = "^0.13.1"
regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
-pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "*"
pre-commit = "*"
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
deleted file mode 100644
index 63c00d5..0000000
--- a/src/jobspy/scrapers/__init__.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-
-from ..jobs import (
- Enum,
- BaseModel,
- JobType,
- JobResponse,
- Country,
- DescriptionFormat,
-)
-
-
-class Site(Enum):
- LINKEDIN = "linkedin"
- INDEED = "indeed"
- ZIP_RECRUITER = "zip_recruiter"
- GLASSDOOR = "glassdoor"
- GOOGLE = "google"
- BAYT = "bayt"
-
-
-class SalarySource(Enum):
- DIRECT_DATA = "direct_data"
- DESCRIPTION = "description"
-
-
-class ScraperInput(BaseModel):
- site_type: list[Site]
- search_term: str | None = None
- google_search_term: str | None = None
-
- location: str | None = None
- country: Country | None = Country.USA
- distance: int | None = None
- is_remote: bool = False
- job_type: JobType | None = None
- easy_apply: bool | None = None
- offset: int = 0
- linkedin_fetch_description: bool = False
- linkedin_company_ids: list[int] | None = None
- description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
-
- results_wanted: int = 15
- hours_old: int | None = None
-
-
-class Scraper(ABC):
- def __init__(
- self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
- ):
- self.site = site
- self.proxies = proxies
- self.ca_cert = ca_cert
-
- @abstractmethod
- def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
diff --git a/src/jobspy/scrapers/ziprecruiter/constants.py b/src/jobspy/scrapers/ziprecruiter/constants.py
deleted file mode 100644
index 7e179c9..0000000
--- a/src/jobspy/scrapers/ziprecruiter/constants.py
+++ /dev/null
@@ -1,10 +0,0 @@
-headers = {
- "Host": "api.ziprecruiter.com",
- "accept": "*/*",
- "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
- "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
- "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
- "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
- "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
- "accept-language": "en-US,en;q=0.9",
-}