mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 19:44:30 -08:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
60d4d911c9 | ||
|
|
2a0cba8c7e | ||
|
|
de70189fa2 | ||
|
|
b55c0eb86d |
63
README.md
63
README.md
@@ -110,6 +110,9 @@ Optional
|
||||
|
|
||||
├── country_indeed (str):
|
||||
| filters the country on Indeed & Glassdoor (see below for correct spelling)
|
||||
|
|
||||
├── enforce_annual_salary (bool):
|
||||
| converts wages to annual salary
|
||||
```
|
||||
|
||||
```
|
||||
@@ -130,42 +133,42 @@ Optional
|
||||
|
||||
```plaintext
|
||||
JobPost
|
||||
├── title (str)
|
||||
├── company (str)
|
||||
├── company_url (str)
|
||||
├── job_url (str)
|
||||
├── location (object)
|
||||
│ ├── country (str)
|
||||
│ ├── city (str)
|
||||
│ ├── state (str)
|
||||
├── description (str)
|
||||
├── job_type (str): fulltime, parttime, internship, contract
|
||||
├── job_function (str)
|
||||
├── compensation (object)
|
||||
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
||||
│ ├── min_amount (int)
|
||||
│ ├── max_amount (int)
|
||||
│ └── currency (enum)
|
||||
├── date_posted (date)
|
||||
├── emails (str)
|
||||
└── is_remote (bool)
|
||||
├── title
|
||||
├── company
|
||||
├── company_url
|
||||
├── job_url
|
||||
├── location
|
||||
│ ├── country
|
||||
│ ├── city
|
||||
│ └── state
|
||||
├── description
|
||||
├── job_type: fulltime, parttime, internship, contract
|
||||
├── job_function
|
||||
├── compensation
│ ├── interval: yearly, monthly, weekly, daily, hourly
|
||||
│ ├── min_amount
|
||||
│ ├── max_amount
|
||||
│ ├── currency
|
||||
│ └── salary_source: direct_data, description (parsed from posting)
|
||||
├── date_posted
|
||||
├── emails
|
||||
└── is_remote
|
||||
|
||||
LinkedIn specific
|
||||
└── job_level (str)
|
||||
└── job_level
|
||||
|
||||
LinkedIn & Indeed specific
|
||||
└── company_industry (str)
|
||||
└── company_industry
|
||||
|
||||
Indeed specific
|
||||
├── company_country (str)
|
||||
└── company_addresses (str)
|
||||
└── company_employees_label (str)
|
||||
└── company_revenue_label (str)
|
||||
└── company_description (str)
|
||||
└── ceo_name (str)
|
||||
└── ceo_photo_url (str)
|
||||
└── logo_photo_url (str)
|
||||
└── banner_photo_url (str)
|
||||
├── company_country
|
||||
├── company_addresses
|
||||
├── company_employees_label
|
||||
├── company_revenue_label
|
||||
├── company_description
|
||||
├── ceo_name
|
||||
├── ceo_photo_url
|
||||
├── logo_photo_url
|
||||
└── banner_photo_url
|
||||
```
|
||||
|
||||
## Supported Countries for Job Searching
|
||||
|
||||
1228
poetry.lock
generated
1228
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.59"
|
||||
version = "1.1.60"
|
||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||
homepage = "https://github.com/Bunsly/JobSpy"
|
||||
@@ -15,7 +15,7 @@ python = "^3.10"
|
||||
requests = "^2.31.0"
|
||||
beautifulsoup4 = "^4.12.2"
|
||||
pandas = "^2.1.0"
|
||||
NUMPY = "1.24.2"
|
||||
NUMPY = "1.26.3"
|
||||
pydantic = "^2.3.0"
|
||||
tls-client = "^1.0.1"
|
||||
markdownify = "^0.11.6"
|
||||
|
||||
@@ -10,7 +10,7 @@ from .scrapers.indeed import IndeedScraper
|
||||
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
||||
from .scrapers.glassdoor import GlassdoorScraper
|
||||
from .scrapers.linkedin import LinkedInScraper
|
||||
from .scrapers import ScraperInput, Site, JobResponse, Country
|
||||
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
|
||||
from .scrapers.exceptions import (
|
||||
LinkedInException,
|
||||
IndeedException,
|
||||
@@ -36,6 +36,7 @@ def scrape_jobs(
|
||||
linkedin_company_ids: list[int] | None = None,
|
||||
offset: int | None = 0,
|
||||
hours_old: int = None,
|
||||
enforce_annual_salary: bool = False,
|
||||
verbose: int = 2,
|
||||
**kwargs,
|
||||
) -> pd.DataFrame:
|
||||
@@ -165,7 +166,8 @@ def scrape_jobs(
|
||||
job_data["min_amount"] = compensation_obj.get("min_amount")
|
||||
job_data["max_amount"] = compensation_obj.get("max_amount")
|
||||
job_data["currency"] = compensation_obj.get("currency", "USD")
|
||||
if (
|
||||
job_data["salary_source"] = SalarySource.DIRECT_DATA.value
|
||||
if enforce_annual_salary and (
|
||||
job_data["interval"]
|
||||
and job_data["interval"] != "yearly"
|
||||
and job_data["min_amount"]
|
||||
@@ -180,8 +182,15 @@ def scrape_jobs(
|
||||
job_data["min_amount"],
|
||||
job_data["max_amount"],
|
||||
job_data["currency"],
|
||||
) = extract_salary(job_data["description"])
|
||||
) = extract_salary(
|
||||
job_data["description"],
|
||||
enforce_annual_salary=enforce_annual_salary,
|
||||
)
|
||||
job_data["salary_source"] = SalarySource.DESCRIPTION.value
|
||||
|
||||
job_data["salary_source"] = (
|
||||
job_data["salary_source"] if job_data["min_amount"] else None
|
||||
)
|
||||
job_df = pd.DataFrame([job_data])
|
||||
jobs_dfs.append(job_df)
|
||||
|
||||
@@ -203,6 +212,7 @@ def scrape_jobs(
|
||||
"location",
|
||||
"job_type",
|
||||
"date_posted",
|
||||
"salary_source",
|
||||
"interval",
|
||||
"min_amount",
|
||||
"max_amount",
|
||||
|
||||
@@ -18,6 +18,9 @@ class Site(Enum):
|
||||
ZIP_RECRUITER = "zip_recruiter"
|
||||
GLASSDOOR = "glassdoor"
|
||||
|
||||
class SalarySource(Enum):
|
||||
DIRECT_DATA = "direct_data"
|
||||
DESCRIPTION = "description"
|
||||
|
||||
class ScraperInput(BaseModel):
|
||||
site_type: list[Site]
|
||||
|
||||
@@ -10,7 +10,7 @@ import numpy as np
|
||||
from markdownify import markdownify as md
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
|
||||
from ..jobs import JobType
|
||||
from ..jobs import CompensationInterval, JobType
|
||||
|
||||
logger = logging.getLogger("JobSpy")
|
||||
logger.propagate = False
|
||||
@@ -193,6 +193,7 @@ def extract_salary(
|
||||
upper_limit=700000,
|
||||
hourly_threshold=350,
|
||||
monthly_threshold=30000,
|
||||
enforce_annual_salary=False,
|
||||
):
|
||||
if not salary_str:
|
||||
return None, None, None, None
|
||||
@@ -220,20 +221,30 @@ def extract_salary(
|
||||
|
||||
# Convert to annual if less than the hourly threshold
|
||||
if min_salary < hourly_threshold:
|
||||
min_salary = convert_hourly_to_annual(min_salary)
|
||||
interval = CompensationInterval.HOURLY.value
|
||||
annual_min_salary = convert_hourly_to_annual(min_salary)
|
||||
if max_salary < hourly_threshold:
|
||||
max_salary = convert_hourly_to_annual(max_salary)
|
||||
annual_max_salary = convert_hourly_to_annual(max_salary)
|
||||
|
||||
elif min_salary < monthly_threshold:
|
||||
min_salary = convert_monthly_to_annual(min_salary)
|
||||
interval = CompensationInterval.MONTHLY.value
|
||||
annual_min_salary = convert_monthly_to_annual(min_salary)
|
||||
if max_salary < monthly_threshold:
|
||||
max_salary = convert_monthly_to_annual(max_salary)
|
||||
annual_max_salary = convert_monthly_to_annual(max_salary)
|
||||
|
||||
else:
|
||||
interval = CompensationInterval.YEARLY.value
|
||||
annual_min_salary = min_salary
|
||||
annual_max_salary = max_salary
|
||||
|
||||
# Ensure salary range is within specified limits
|
||||
if (
|
||||
lower_limit <= min_salary <= upper_limit
|
||||
and lower_limit <= max_salary <= upper_limit
|
||||
and min_salary < max_salary
|
||||
lower_limit <= annual_min_salary <= upper_limit
|
||||
and lower_limit <= annual_max_salary <= upper_limit
|
||||
and annual_min_salary < annual_max_salary
|
||||
):
|
||||
return "yearly", min_salary, max_salary, "USD"
|
||||
if enforce_annual_salary:
|
||||
return interval, annual_min_salary, annual_max_salary, "USD"
|
||||
else:
|
||||
return interval, min_salary, max_salary, "USD"
|
||||
return None, None, None, None
|
||||
|
||||
Reference in New Issue
Block a user