Compare commits

..

8 Commits

Author SHA1 Message Date
Cullen Watson
286f0e4487 docs:readme 2024-09-18 18:49:41 -05:00
Cullen Watson
f7b29d43a2 fix(indeed):sort relevance not date (#197) 2024-09-18 18:42:25 -05:00
Cullen Watson
6f1490458c fix key error (#186) 2024-08-14 02:54:40 -05:00
Cullen Watson
6bb7d81ba8 change linkedin ep (#185) 2024-08-14 02:39:43 -05:00
Cullen Watson
0e046432d1 fix:variable bug (#181) 2024-08-05 12:47:55 -05:00
Cullen Watson
209e0e65b6 fix:malaysia indeed (#180) 2024-08-03 22:48:53 -05:00
Cullen Watson
8570c0651e fix:key error (#176) 2024-07-21 13:05:18 -05:00
Cullen Watson
8678b0bbe4 enh: test on pr (#174) 2024-07-19 14:25:25 -05:00
15 changed files with 65 additions and 27 deletions

22
.github/workflows/python-test.yml vendored Normal file
View File

@@ -0,0 +1,22 @@
name: Python Tests
on:
pull_request:
branches:
- main
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Install dependencies
run: |
pip install poetry
poetry install
- name: Run tests
run: poetry run pytest src/tests/test_all.py

View File

@@ -37,7 +37,7 @@ jobs = scrape_jobs(
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor
# linkedin_fetch_description=True # get full description , direct job url , company industry and job level (seniority level) for linkedin (slower)
# linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
)
@@ -216,10 +216,8 @@ You can specify the following countries when searching on Indeed (use the exact
## Frequently Asked Questions
---
**Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
**Q: Why is Indeed giving unrelated roles?**
**A:** Indeed is searching each one of your terms e.g. software intern, it searches software OR intern. Try search_term='"software intern"' in quotes for stricter searching
---
@@ -230,3 +228,9 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
- Try using the proxies param to change your IP address.
---
**Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
---

2
poetry.toml Normal file
View File

@@ -0,0 +1,2 @@
[virtualenvs]
in-project = true

View File

@@ -1,10 +1,11 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.60"
version = "1.1.66"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md"
keywords = ['jobs-scraper', 'linkedin', 'indeed', 'glassdoor', 'ziprecruiter']
packages = [
{ include = "jobspy", from = "src" }

View File

@@ -189,7 +189,9 @@ def scrape_jobs(
job_data["salary_source"] = SalarySource.DESCRIPTION.value
job_data["salary_source"] = (
job_data["salary_source"] if job_data["min_amount"] else None
job_data["salary_source"]
if "min_amount" in job_data and job_data["min_amount"]
else None
)
job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)

View File

@@ -92,7 +92,7 @@ class Country(Enum):
JAPAN = ("japan", "jp")
KUWAIT = ("kuwait", "kw")
LUXEMBOURG = ("luxembourg", "lu")
MALAYSIA = ("malaysia", "malaysia")
MALAYSIA = ("malaysia", "malaysia:my", "com")
MEXICO = ("mexico", "mx", "com.mx")
MOROCCO = ("morocco", "ma")
NETHERLANDS = ("netherlands", "nl", "nl")

View File

@@ -364,8 +364,8 @@ class IndeedScraper(Scraper):
{what}
{location}
limit: 100
sort: DATE
{cursor}
sort: RELEVANCE
{filters}
) {{
pageInfo {{

View File

@@ -236,7 +236,7 @@ class LinkedInScraper(Scraper):
"""
try:
response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5
f"{self.base_url}/jobs/view/{job_id}", timeout=5
)
response.raise_for_status()
except:

View File

@@ -198,6 +198,7 @@ def extract_salary(
if not salary_str:
return None, None, None, None
annual_max_salary = None
min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
def to_int(s):
@@ -238,6 +239,8 @@ def extract_salary(
annual_max_salary = max_salary
# Ensure salary range is within specified limits
if not annual_max_salary:
return None, None, None, None
if (
lower_limit <= annual_min_salary <= upper_limit
and lower_limit <= annual_max_salary <= upper_limit

View File

@@ -200,7 +200,7 @@ class ZipRecruiterScraper(Scraper):
script_tag = soup.find("script", type="application/json")
if script_tag:
job_json = json.loads(script_tag.string)
job_url_val = job_json["model"]["saveJobURL"]
job_url_val = job_json["model"].get("saveJobURL", "")
m = re.search(r"job_url=(.+)", job_url_val)
if m:
job_url_direct = m.group(1)

View File

@@ -4,11 +4,15 @@ import pandas as pd
def test_all():
result = scrape_jobs(
site_name=["linkedin", "indeed", "zip_recruiter", "glassdoor"],
search_term="software engineer",
site_name=[
"linkedin",
"indeed",
"glassdoor",
], # ziprecruiter needs good ip, and temp fix to pass test on ci
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and not result.empty
isinstance(result, pd.DataFrame) and len(result) == 15
), "Result should be a non-empty DataFrame"

View File

@@ -2,10 +2,12 @@ from ..jobspy import scrape_jobs
import pandas as pd
def test_indeed():
def test_glassdoor():
result = scrape_jobs(
site_name="glassdoor", search_term="software engineer", country_indeed="USA"
site_name="glassdoor",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and not result.empty
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -4,8 +4,10 @@ import pandas as pd
def test_indeed():
result = scrape_jobs(
site_name="indeed", search_term="software engineer", country_indeed="usa"
site_name="indeed",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and not result.empty
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -3,10 +3,7 @@ import pandas as pd
def test_linkedin():
result = scrape_jobs(
site_name="linkedin",
search_term="software engineer",
)
result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
assert (
isinstance(result, pd.DataFrame) and not result.empty
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -4,10 +4,9 @@ import pandas as pd
def test_ziprecruiter():
result = scrape_jobs(
site_name="zip_recruiter",
search_term="software engineer",
site_name="zip_recruiter", search_term="software engineer", results_wanted=5
)
assert (
isinstance(result, pd.DataFrame) and not result.empty
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"