mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-09 15:04:33 -07:00
Compare commits
14 Commits
1.1.75
...
04032a0f91
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
04032a0f91 | ||
|
|
496896d0b5 | ||
|
|
87ba1ad1bf | ||
|
|
4e7ac9a583 | ||
|
|
e44d13e1cf | ||
|
|
d52e366ef7 | ||
|
|
395ebf0017 | ||
|
|
63fddd9b7f | ||
|
|
58956868ae | ||
|
|
4fce836222 | ||
|
|
5ba25e7a7c | ||
|
|
f7cb3e9206 | ||
|
|
3ad3f121f7 | ||
|
|
ff3c782912 |
39
.github/workflows/publish-to-pypi.yml
vendored
39
.github/workflows/publish-to-pypi.yml
vendored
@@ -1,33 +1,50 @@
|
|||||||
name: Publish Python 🐍 distributions 📦 to PyPI
|
name: Publish Python 🐍 distributions 📦 to PyPI
|
||||||
on: push
|
on:
|
||||||
|
pull_request:
|
||||||
|
types:
|
||||||
|
- closed
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-n-publish:
|
build-n-publish:
|
||||||
name: Build and publish Python 🐍 distributions 📦 to PyPI
|
name: Build and publish Python 🐍 distributions 📦 to PyPI
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: "3.10"
|
python-version: "3.10"
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: pip install toml
|
||||||
|
|
||||||
|
- name: Increment version
|
||||||
|
run: python increment_version.py
|
||||||
|
|
||||||
|
- name: Commit version increment
|
||||||
|
run: |
|
||||||
|
git config --global user.name 'github-actions'
|
||||||
|
git config --global user.email 'github-actions@github.com'
|
||||||
|
git add pyproject.toml
|
||||||
|
git commit -m 'Increment version'
|
||||||
|
|
||||||
|
- name: Push changes
|
||||||
|
run: git push
|
||||||
|
|
||||||
- name: Install poetry
|
- name: Install poetry
|
||||||
run: >-
|
run: pip install poetry --user
|
||||||
python3 -m
|
|
||||||
pip install
|
|
||||||
poetry
|
|
||||||
--user
|
|
||||||
|
|
||||||
- name: Build distribution 📦
|
- name: Build distribution 📦
|
||||||
run: >-
|
run: poetry build
|
||||||
python3 -m
|
|
||||||
poetry
|
|
||||||
build
|
|
||||||
|
|
||||||
- name: Publish distribution 📦 to PyPI
|
- name: Publish distribution 📦 to PyPI
|
||||||
if: startsWith(github.ref, 'refs/tags')
|
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
with:
|
with:
|
||||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||||
109
README.md
109
README.md
@@ -8,8 +8,8 @@ work with us.*
|
|||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
|
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
|
||||||
- Aggregates the job postings in a Pandas DataFrame
|
- Aggregates the job postings in a dataframe
|
||||||
- Proxies support
|
- Proxies support to bypass blocking
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
@@ -33,12 +33,11 @@ jobs = scrape_jobs(
|
|||||||
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
||||||
location="San Francisco, CA",
|
location="San Francisco, CA",
|
||||||
results_wanted=20,
|
results_wanted=20,
|
||||||
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
|
hours_old=72,
|
||||||
country_indeed='USA', # only needed for indeed / glassdoor
|
country_indeed='USA',
|
||||||
|
|
||||||
# linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
|
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
|
||||||
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
||||||
|
|
||||||
)
|
)
|
||||||
print(f"Found {len(jobs)} jobs")
|
print(f"Found {len(jobs)} jobs")
|
||||||
print(jobs.head())
|
print(jobs.head())
|
||||||
@@ -68,7 +67,7 @@ Optional
|
|||||||
├── search_term (str)
|
├── search_term (str)
|
||||||
|
|
|
|
||||||
├── google_search_term (str)
|
├── google_search_term (str)
|
||||||
| search term for google jobs. This is is only param for filtering google jobs.
|
| search term for google jobs. This is the only param for filtering google jobs.
|
||||||
│
|
│
|
||||||
├── location (str)
|
├── location (str)
|
||||||
│
|
│
|
||||||
@@ -88,7 +87,7 @@ Optional
|
|||||||
| number of job results to retrieve for each site specified in 'site_name'
|
| number of job results to retrieve for each site specified in 'site_name'
|
||||||
│
|
│
|
||||||
├── easy_apply (bool):
|
├── easy_apply (bool):
|
||||||
| filters for jobs that are hosted on the job board site
|
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
||||||
│
|
│
|
||||||
├── description_format (str):
|
├── description_format (str):
|
||||||
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
||||||
@@ -133,46 +132,6 @@ Optional
|
|||||||
| - easy_apply
|
| - easy_apply
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
### JobPost Schema
|
|
||||||
|
|
||||||
```plaintext
|
|
||||||
JobPost
|
|
||||||
├── title
|
|
||||||
├── company
|
|
||||||
├── company_url
|
|
||||||
├── job_url
|
|
||||||
├── location
|
|
||||||
│ ├── country
|
|
||||||
│ ├── city
|
|
||||||
│ ├── state
|
|
||||||
├── description
|
|
||||||
├── job_type: fulltime, parttime, internship, contract
|
|
||||||
├── job_function
|
|
||||||
│ ├── interval: yearly, monthly, weekly, daily, hourly
|
|
||||||
│ ├── min_amount
|
|
||||||
│ ├── max_amount
|
|
||||||
│ ├── currency
|
|
||||||
│ └── salary_source: direct_data, description (parsed from posting)
|
|
||||||
├── date_posted
|
|
||||||
├── emails
|
|
||||||
└── is_remote
|
|
||||||
|
|
||||||
Linkedin specific
|
|
||||||
└── job_level
|
|
||||||
|
|
||||||
Linkedin & Indeed specific
|
|
||||||
└── company_industry
|
|
||||||
|
|
||||||
Indeed specific
|
|
||||||
├── company_country
|
|
||||||
├── company_addresses
|
|
||||||
├── company_employees_label
|
|
||||||
├── company_revenue_label
|
|
||||||
├── company_description
|
|
||||||
└── company_logo
|
|
||||||
```
|
|
||||||
|
|
||||||
## Supported Countries for Job Searching
|
## Supported Countries for Job Searching
|
||||||
|
|
||||||
### **LinkedIn**
|
### **LinkedIn**
|
||||||
@@ -219,7 +178,18 @@ You can specify the following countries when searching on Indeed (use the exact
|
|||||||
|
|
||||||
---
|
---
|
||||||
**Q: Why is Indeed giving unrelated roles?**
|
**Q: Why is Indeed giving unrelated roles?**
|
||||||
**A:** Indeed is searching each one of your terms e.g. software intern, it searches software OR intern. Try search_term='"software intern"' in quotes for stricter searching
|
**A:** Indeed matches your search terms against the job description as well as the title, so loosely related postings can show up.
|
||||||
|
|
||||||
|
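- put `-` in front of a word to exclude postings containing it (e.g. `-tax`)
|
||||||
|
- wrap a phrase in `""` to require an exact match (e.g. `"engineering intern"`)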
|
||||||
|
|
||||||
|
Example of a good Indeed query
|
||||||
|
|
||||||
|
```py
|
||||||
|
search_term='"engineering intern" software summer (java OR python OR c++) 2025 -tax -marketing'
|
||||||
|
```
|
||||||
|
|
||||||
|
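This searches the title and description. A posting must contain `software`, `summer`, and `2025`, at least one of the languages (`java`, `python`, or `c++`), and the exact phrase `engineering intern`; postings that mention `tax` or `marketing` are excluded.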
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -231,8 +201,41 @@ You can specify the following countries when searching on Indeed (use the exact
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**Q: Encountering issues with your queries?**
|
### JobPost Schema
|
||||||
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
|
|
||||||
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
|
|
||||||
|
|
||||||
---
|
```plaintext
|
||||||
|
JobPost
|
||||||
|
├── title
|
||||||
|
├── company
|
||||||
|
├── company_url
|
||||||
|
├── job_url
|
||||||
|
├── location
|
||||||
|
│ ├── country
|
||||||
|
│ ├── city
|
||||||
|
│ ├── state
|
||||||
|
├── description
|
||||||
|
├── job_type: fulltime, parttime, internship, contract
|
||||||
|
├── job_function
|
||||||
|
│ ├── interval: yearly, monthly, weekly, daily, hourly
|
||||||
|
│ ├── min_amount
|
||||||
|
│ ├── max_amount
|
||||||
|
│ ├── currency
|
||||||
|
│ └── salary_source: direct_data, description (parsed from posting)
|
||||||
|
├── date_posted
|
||||||
|
├── emails
|
||||||
|
└── is_remote
|
||||||
|
|
||||||
|
Linkedin specific
|
||||||
|
└── job_level
|
||||||
|
|
||||||
|
Linkedin & Indeed specific
|
||||||
|
└── company_industry
|
||||||
|
|
||||||
|
Indeed specific
|
||||||
|
├── company_country
|
||||||
|
├── company_addresses
|
||||||
|
├── company_employees_label
|
||||||
|
├── company_revenue_label
|
||||||
|
├── company_description
|
||||||
|
└── company_logo
|
||||||
|
```
|
||||||
|
|||||||
21
increment_version.py
Normal file
21
increment_version.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import toml
|
||||||
|
|
||||||
|
def increment_version(version):
|
||||||
|
major, minor, patch = map(int, version.split('.'))
|
||||||
|
patch += 1
|
||||||
|
return f"{major}.{minor}.{patch}"
|
||||||
|
|
||||||
|
# Load pyproject.toml
|
||||||
|
with open('pyproject.toml', 'r') as file:
|
||||||
|
pyproject = toml.load(file)
|
||||||
|
|
||||||
|
# Increment the version
|
||||||
|
current_version = pyproject['tool']['poetry']['version']
|
||||||
|
new_version = increment_version(current_version)
|
||||||
|
pyproject['tool']['poetry']['version'] = new_version
|
||||||
|
|
||||||
|
# Save the updated pyproject.toml
|
||||||
|
with open('pyproject.toml', 'w') as file:
|
||||||
|
toml.dump(pyproject, file)
|
||||||
|
|
||||||
|
print(f"Version updated from {current_version} to {new_version}")
|
||||||
@@ -1,2 +0,0 @@
|
|||||||
[virtualenvs]
|
|
||||||
in-project = true
|
|
||||||
@@ -1,15 +1,21 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = [ "poetry-core",]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.75"
|
version = "1.1.76"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
keywords = ['jobs-scraper', 'linkedin', 'indeed', 'glassdoor', 'ziprecruiter']
|
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
|
||||||
|
[[tool.poetry.packages]]
|
||||||
|
include = "jobspy"
|
||||||
|
from = "src"
|
||||||
|
|
||||||
packages = [
|
[tool.black]
|
||||||
{ include = "jobspy", from = "src" }
|
line-length = 88
|
||||||
]
|
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
@@ -22,16 +28,8 @@ tls-client = "^1.0.1"
|
|||||||
markdownify = "^0.13.1"
|
markdownify = "^0.13.1"
|
||||||
regex = "^2024.4.28"
|
regex = "^2024.4.28"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
pytest = "^7.4.1"
|
pytest = "^7.4.1"
|
||||||
jupyter = "^1.0.0"
|
jupyter = "^1.0.0"
|
||||||
black = "*"
|
black = "*"
|
||||||
pre-commit = "*"
|
pre-commit = "*"
|
||||||
|
|
||||||
[build-system]
|
|
||||||
requires = ["poetry-core"]
|
|
||||||
build-backend = "poetry.core.masonry.api"
|
|
||||||
|
|
||||||
[tool.black]
|
|
||||||
line-length = 88
|
|
||||||
|
|||||||
@@ -232,7 +232,7 @@ class GoogleJobsScraper(Scraper):
|
|||||||
def _find_job_info_initial_page(html_text: str):
|
def _find_job_info_initial_page(html_text: str):
|
||||||
pattern = (
|
pattern = (
|
||||||
f'520084652":('
|
f'520084652":('
|
||||||
+ r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
|
+ r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
|
||||||
)
|
)
|
||||||
results = []
|
results = []
|
||||||
matches = re.finditer(pattern, html_text)
|
matches = re.finditer(pattern, html_text)
|
||||||
|
|||||||
Reference in New Issue
Block a user