mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0e046432d1 | ||
|
|
209e0e65b6 | ||
|
|
8570c0651e | ||
|
|
8678b0bbe4 | ||
|
|
60d4d911c9 | ||
|
|
2a0cba8c7e | ||
|
|
de70189fa2 | ||
|
|
b55c0eb86d |
22
.github/workflows/python-test.yml
vendored
Normal file
22
.github/workflows/python-test.yml
vendored
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
name: Python Tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: '3.8'
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install poetry
|
||||||
|
poetry install
|
||||||
|
- name: Run tests
|
||||||
|
run: poetry run pytest src/tests/test_all.py
|
||||||
63
README.md
63
README.md
@@ -110,6 +110,9 @@ Optional
|
|||||||
|
|
|
|
||||||
├── country_indeed (str):
|
├── country_indeed (str):
|
||||||
| filters the country on Indeed & Glassdoor (see below for correct spelling)
|
| filters the country on Indeed & Glassdoor (see below for correct spelling)
|
||||||
|
|
|
||||||
|
├── enforce_annual_salary (bool):
|
||||||
|
| converts wages to annual salary
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -130,42 +133,42 @@ Optional
|
|||||||
|
|
||||||
```plaintext
|
```plaintext
|
||||||
JobPost
|
JobPost
|
||||||
├── title (str)
|
├── title
|
||||||
├── company (str)
|
├── company
|
||||||
├── company_url (str)
|
├── company_url
|
||||||
├── job_url (str)
|
├── job_url
|
||||||
├── location (object)
|
├── location
|
||||||
│ ├── country (str)
|
│ ├── country
|
||||||
│ ├── city (str)
|
│ ├── city
|
||||||
│ ├── state (str)
|
│ ├── state
|
||||||
├── description (str)
|
├── description
|
||||||
├── job_type (str): fulltime, parttime, internship, contract
|
├── job_type: fulltime, parttime, internship, contract
|
||||||
├── job_function (str)
|
├── job_function
|
||||||
├── compensation (object)
|
│ ├── interval: yearly, monthly, weekly, daily, hourly
|
||||||
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
│ ├── min_amount
|
||||||
│ ├── min_amount (int)
|
│ ├── max_amount
|
||||||
│ ├── max_amount (int)
|
│ ├── currency
|
||||||
│ └── currency (enum)
|
│ └── salary_source: direct_data, description (parsed from posting)
|
||||||
├── date_posted (date)
|
├── date_posted
|
||||||
├── emails (str)
|
├── emails
|
||||||
└── is_remote (bool)
|
└── is_remote
|
||||||
|
|
||||||
Linkedin specific
|
Linkedin specific
|
||||||
└── job_level (str)
|
└── job_level
|
||||||
|
|
||||||
Linkedin & Indeed specific
|
Linkedin & Indeed specific
|
||||||
└── company_industry (str)
|
└── company_industry
|
||||||
|
|
||||||
Indeed specific
|
Indeed specific
|
||||||
├── company_country (str)
|
├── company_country
|
||||||
└── company_addresses (str)
|
├── company_addresses
|
||||||
└── company_employees_label (str)
|
├── company_employees_label
|
||||||
└── company_revenue_label (str)
|
├── company_revenue_label
|
||||||
└── company_description (str)
|
├── company_description
|
||||||
└── ceo_name (str)
|
├── ceo_name
|
||||||
└── ceo_photo_url (str)
|
├── ceo_photo_url
|
||||||
└── logo_photo_url (str)
|
├── logo_photo_url
|
||||||
└── banner_photo_url (str)
|
└── banner_photo_url
|
||||||
```
|
```
|
||||||
|
|
||||||
## Supported Countries for Job Searching
|
## Supported Countries for Job Searching
|
||||||
|
|||||||
1228
poetry.lock
generated
1228
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
2
poetry.toml
Normal file
2
poetry.toml
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[virtualenvs]
|
||||||
|
in-project = true
|
||||||
@@ -1,10 +1,11 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.59"
|
version = "1.1.63"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
keywords = ['jobs-scraper', 'linkedin', 'indeed', 'glassdoor', 'ziprecruiter']
|
||||||
|
|
||||||
packages = [
|
packages = [
|
||||||
{ include = "jobspy", from = "src" }
|
{ include = "jobspy", from = "src" }
|
||||||
@@ -15,7 +16,7 @@ python = "^3.10"
|
|||||||
requests = "^2.31.0"
|
requests = "^2.31.0"
|
||||||
beautifulsoup4 = "^4.12.2"
|
beautifulsoup4 = "^4.12.2"
|
||||||
pandas = "^2.1.0"
|
pandas = "^2.1.0"
|
||||||
NUMPY = "1.24.2"
|
NUMPY = "1.26.3"
|
||||||
pydantic = "^2.3.0"
|
pydantic = "^2.3.0"
|
||||||
tls-client = "^1.0.1"
|
tls-client = "^1.0.1"
|
||||||
markdownify = "^0.11.6"
|
markdownify = "^0.11.6"
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from .scrapers.indeed import IndeedScraper
|
|||||||
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
||||||
from .scrapers.glassdoor import GlassdoorScraper
|
from .scrapers.glassdoor import GlassdoorScraper
|
||||||
from .scrapers.linkedin import LinkedInScraper
|
from .scrapers.linkedin import LinkedInScraper
|
||||||
from .scrapers import ScraperInput, Site, JobResponse, Country
|
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
|
||||||
from .scrapers.exceptions import (
|
from .scrapers.exceptions import (
|
||||||
LinkedInException,
|
LinkedInException,
|
||||||
IndeedException,
|
IndeedException,
|
||||||
@@ -36,6 +36,7 @@ def scrape_jobs(
|
|||||||
linkedin_company_ids: list[int] | None = None,
|
linkedin_company_ids: list[int] | None = None,
|
||||||
offset: int | None = 0,
|
offset: int | None = 0,
|
||||||
hours_old: int = None,
|
hours_old: int = None,
|
||||||
|
enforce_annual_salary: bool = False,
|
||||||
verbose: int = 2,
|
verbose: int = 2,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
@@ -165,7 +166,8 @@ def scrape_jobs(
|
|||||||
job_data["min_amount"] = compensation_obj.get("min_amount")
|
job_data["min_amount"] = compensation_obj.get("min_amount")
|
||||||
job_data["max_amount"] = compensation_obj.get("max_amount")
|
job_data["max_amount"] = compensation_obj.get("max_amount")
|
||||||
job_data["currency"] = compensation_obj.get("currency", "USD")
|
job_data["currency"] = compensation_obj.get("currency", "USD")
|
||||||
if (
|
job_data["salary_source"] = SalarySource.DIRECT_DATA.value
|
||||||
|
if enforce_annual_salary and (
|
||||||
job_data["interval"]
|
job_data["interval"]
|
||||||
and job_data["interval"] != "yearly"
|
and job_data["interval"] != "yearly"
|
||||||
and job_data["min_amount"]
|
and job_data["min_amount"]
|
||||||
@@ -180,8 +182,17 @@ def scrape_jobs(
|
|||||||
job_data["min_amount"],
|
job_data["min_amount"],
|
||||||
job_data["max_amount"],
|
job_data["max_amount"],
|
||||||
job_data["currency"],
|
job_data["currency"],
|
||||||
) = extract_salary(job_data["description"])
|
) = extract_salary(
|
||||||
|
job_data["description"],
|
||||||
|
enforce_annual_salary=enforce_annual_salary,
|
||||||
|
)
|
||||||
|
job_data["salary_source"] = SalarySource.DESCRIPTION.value
|
||||||
|
|
||||||
|
job_data["salary_source"] = (
|
||||||
|
job_data["salary_source"]
|
||||||
|
if "min_amount" in job_data and job_data["min_amount"]
|
||||||
|
else None
|
||||||
|
)
|
||||||
job_df = pd.DataFrame([job_data])
|
job_df = pd.DataFrame([job_data])
|
||||||
jobs_dfs.append(job_df)
|
jobs_dfs.append(job_df)
|
||||||
|
|
||||||
@@ -203,6 +214,7 @@ def scrape_jobs(
|
|||||||
"location",
|
"location",
|
||||||
"job_type",
|
"job_type",
|
||||||
"date_posted",
|
"date_posted",
|
||||||
|
"salary_source",
|
||||||
"interval",
|
"interval",
|
||||||
"min_amount",
|
"min_amount",
|
||||||
"max_amount",
|
"max_amount",
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ class Country(Enum):
|
|||||||
JAPAN = ("japan", "jp")
|
JAPAN = ("japan", "jp")
|
||||||
KUWAIT = ("kuwait", "kw")
|
KUWAIT = ("kuwait", "kw")
|
||||||
LUXEMBOURG = ("luxembourg", "lu")
|
LUXEMBOURG = ("luxembourg", "lu")
|
||||||
MALAYSIA = ("malaysia", "malaysia")
|
MALAYSIA = ("malaysia", "malaysia:my", "com")
|
||||||
MEXICO = ("mexico", "mx", "com.mx")
|
MEXICO = ("mexico", "mx", "com.mx")
|
||||||
MOROCCO = ("morocco", "ma")
|
MOROCCO = ("morocco", "ma")
|
||||||
NETHERLANDS = ("netherlands", "nl", "nl")
|
NETHERLANDS = ("netherlands", "nl", "nl")
|
||||||
|
|||||||
@@ -18,6 +18,9 @@ class Site(Enum):
|
|||||||
ZIP_RECRUITER = "zip_recruiter"
|
ZIP_RECRUITER = "zip_recruiter"
|
||||||
GLASSDOOR = "glassdoor"
|
GLASSDOOR = "glassdoor"
|
||||||
|
|
||||||
|
class SalarySource(Enum):
|
||||||
|
DIRECT_DATA = "direct_data"
|
||||||
|
DESCRIPTION = "description"
|
||||||
|
|
||||||
class ScraperInput(BaseModel):
|
class ScraperInput(BaseModel):
|
||||||
site_type: list[Site]
|
site_type: list[Site]
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import numpy as np
|
|||||||
from markdownify import markdownify as md
|
from markdownify import markdownify as md
|
||||||
from requests.adapters import HTTPAdapter, Retry
|
from requests.adapters import HTTPAdapter, Retry
|
||||||
|
|
||||||
from ..jobs import JobType
|
from ..jobs import CompensationInterval, JobType
|
||||||
|
|
||||||
logger = logging.getLogger("JobSpy")
|
logger = logging.getLogger("JobSpy")
|
||||||
logger.propagate = False
|
logger.propagate = False
|
||||||
@@ -193,10 +193,12 @@ def extract_salary(
|
|||||||
upper_limit=700000,
|
upper_limit=700000,
|
||||||
hourly_threshold=350,
|
hourly_threshold=350,
|
||||||
monthly_threshold=30000,
|
monthly_threshold=30000,
|
||||||
|
enforce_annual_salary=False,
|
||||||
):
|
):
|
||||||
if not salary_str:
|
if not salary_str:
|
||||||
return None, None, None, None
|
return None, None, None, None
|
||||||
|
|
||||||
|
annual_max_salary = None
|
||||||
min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
|
min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
|
||||||
|
|
||||||
def to_int(s):
|
def to_int(s):
|
||||||
@@ -220,20 +222,32 @@ def extract_salary(
|
|||||||
|
|
||||||
# Convert to annual if less than the hourly threshold
|
# Convert to annual if less than the hourly threshold
|
||||||
if min_salary < hourly_threshold:
|
if min_salary < hourly_threshold:
|
||||||
min_salary = convert_hourly_to_annual(min_salary)
|
interval = CompensationInterval.HOURLY.value
|
||||||
|
annual_min_salary = convert_hourly_to_annual(min_salary)
|
||||||
if max_salary < hourly_threshold:
|
if max_salary < hourly_threshold:
|
||||||
max_salary = convert_hourly_to_annual(max_salary)
|
annual_max_salary = convert_hourly_to_annual(max_salary)
|
||||||
|
|
||||||
elif min_salary < monthly_threshold:
|
elif min_salary < monthly_threshold:
|
||||||
min_salary = convert_monthly_to_annual(min_salary)
|
interval = CompensationInterval.MONTHLY.value
|
||||||
|
annual_min_salary = convert_monthly_to_annual(min_salary)
|
||||||
if max_salary < monthly_threshold:
|
if max_salary < monthly_threshold:
|
||||||
max_salary = convert_monthly_to_annual(max_salary)
|
annual_max_salary = convert_monthly_to_annual(max_salary)
|
||||||
|
|
||||||
|
else:
|
||||||
|
interval = CompensationInterval.YEARLY.value
|
||||||
|
annual_min_salary = min_salary
|
||||||
|
annual_max_salary = max_salary
|
||||||
|
|
||||||
# Ensure salary range is within specified limits
|
# Ensure salary range is within specified limits
|
||||||
|
if not annual_max_salary:
|
||||||
|
return None, None, None, None
|
||||||
if (
|
if (
|
||||||
lower_limit <= min_salary <= upper_limit
|
lower_limit <= annual_min_salary <= upper_limit
|
||||||
and lower_limit <= max_salary <= upper_limit
|
and lower_limit <= annual_max_salary <= upper_limit
|
||||||
and min_salary < max_salary
|
and annual_min_salary < annual_max_salary
|
||||||
):
|
):
|
||||||
return "yearly", min_salary, max_salary, "USD"
|
if enforce_annual_salary:
|
||||||
|
return interval, annual_min_salary, annual_max_salary, "USD"
|
||||||
|
else:
|
||||||
|
return interval, min_salary, max_salary, "USD"
|
||||||
return None, None, None, None
|
return None, None, None, None
|
||||||
|
|||||||
@@ -4,11 +4,15 @@ import pandas as pd
|
|||||||
|
|
||||||
def test_all():
|
def test_all():
|
||||||
result = scrape_jobs(
|
result = scrape_jobs(
|
||||||
site_name=["linkedin", "indeed", "zip_recruiter", "glassdoor"],
|
site_name=[
|
||||||
search_term="software engineer",
|
"linkedin",
|
||||||
|
"indeed",
|
||||||
|
"glassdoor",
|
||||||
|
], # ziprecruiter needs good ip, and temp fix to pass test on ci
|
||||||
|
search_term="engineer",
|
||||||
results_wanted=5,
|
results_wanted=5,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
isinstance(result, pd.DataFrame) and len(result) == 15
|
||||||
), "Result should be a non-empty DataFrame"
|
), "Result should be a non-empty DataFrame"
|
||||||
|
|||||||
@@ -2,10 +2,12 @@ from ..jobspy import scrape_jobs
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def test_indeed():
|
def test_glassdoor():
|
||||||
result = scrape_jobs(
|
result = scrape_jobs(
|
||||||
site_name="glassdoor", search_term="software engineer", country_indeed="USA"
|
site_name="glassdoor",
|
||||||
|
search_term="engineer",
|
||||||
|
results_wanted=5,
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
isinstance(result, pd.DataFrame) and len(result) == 5
|
||||||
), "Result should be a non-empty DataFrame"
|
), "Result should be a non-empty DataFrame"
|
||||||
|
|||||||
@@ -4,8 +4,10 @@ import pandas as pd
|
|||||||
|
|
||||||
def test_indeed():
|
def test_indeed():
|
||||||
result = scrape_jobs(
|
result = scrape_jobs(
|
||||||
site_name="indeed", search_term="software engineer", country_indeed="usa"
|
site_name="indeed",
|
||||||
|
search_term="engineer",
|
||||||
|
results_wanted=5,
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
isinstance(result, pd.DataFrame) and len(result) == 5
|
||||||
), "Result should be a non-empty DataFrame"
|
), "Result should be a non-empty DataFrame"
|
||||||
|
|||||||
@@ -3,10 +3,7 @@ import pandas as pd
|
|||||||
|
|
||||||
|
|
||||||
def test_linkedin():
|
def test_linkedin():
|
||||||
result = scrape_jobs(
|
result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
|
||||||
site_name="linkedin",
|
|
||||||
search_term="software engineer",
|
|
||||||
)
|
|
||||||
assert (
|
assert (
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
isinstance(result, pd.DataFrame) and len(result) == 5
|
||||||
), "Result should be a non-empty DataFrame"
|
), "Result should be a non-empty DataFrame"
|
||||||
|
|||||||
@@ -4,10 +4,9 @@ import pandas as pd
|
|||||||
|
|
||||||
def test_ziprecruiter():
|
def test_ziprecruiter():
|
||||||
result = scrape_jobs(
|
result = scrape_jobs(
|
||||||
site_name="zip_recruiter",
|
site_name="zip_recruiter", search_term="software engineer", results_wanted=5
|
||||||
search_term="software engineer",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
isinstance(result, pd.DataFrame) and len(result) == 5
|
||||||
), "Result should be a non-empty DataFrame"
|
), "Result should be a non-empty DataFrame"
|
||||||
|
|||||||
Reference in New Issue
Block a user