mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
286b9e1256 | ||
|
|
162dd40b0f | ||
|
|
558e352939 | ||
|
|
efad1a1b7d | ||
|
|
eaa481c2f4 | ||
|
|
b914aa6449 | ||
|
|
6adbfb8b29 | ||
|
|
a3b9dd50ff | ||
|
|
d3ba3a4878 | ||
|
|
f524789d74 | ||
|
|
f3890d4830 | ||
|
|
60c9728691 | ||
|
|
f79d975e5f | ||
|
|
d6368f909b | ||
|
|
6fcf7f666e | ||
|
|
4406f9350f | ||
|
|
ca5155f234 | ||
|
|
822a55783e |
@@ -9,7 +9,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from jobspy import scrape_jobs\n",
|
"from jobspy import scrape_jobs\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from IPython.display import display, HTML\n"
|
"from IPython.display import display, HTML"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -34,18 +34,16 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# example 1 (no hyperlinks, USA)\n",
|
"# example 1 (no hyperlinks, USA)\n",
|
||||||
"jobs = scrape_jobs(\n",
|
"jobs = scrape_jobs(\n",
|
||||||
" site_name=[\"linkedin\", \"zip_recruiter\"],\n",
|
" site_name=[\"linkedin\"],\n",
|
||||||
" location='san francisco',\n",
|
" location='san francisco',\n",
|
||||||
" search_term=\"engineer\",\n",
|
" search_term=\"engineer\",\n",
|
||||||
" results_wanted=5,\n",
|
" results_wanted=5,\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # use if you want to use a proxy\n",
|
" # use if you want to use a proxy\n",
|
||||||
" # proxy=\"socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
|
" # proxy=\"socks5://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
|
||||||
" # proxy=\"http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
|
" proxy=\"http://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
|
||||||
" # proxy=\"https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
|
" #proxy=\"https://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
|
||||||
"\n",
|
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
|
||||||
"display(jobs)"
|
"display(jobs)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -97,9 +95,6 @@
|
|||||||
" hyperlinks=True,\n",
|
" hyperlinks=True,\n",
|
||||||
" results_wanted=5,\n",
|
" results_wanted=5,\n",
|
||||||
" easy_apply=True\n",
|
" easy_apply=True\n",
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -125,11 +120,10 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# example 4 - international indeed (no zip_recruiter)\n",
|
"# example 4 - international indeed (no zip_recruiter)\n",
|
||||||
"result = scrape_jobs(\n",
|
"jobs = scrape_jobs(\n",
|
||||||
" site_name=[\"indeed\"],\n",
|
" site_name=[\"indeed\"],\n",
|
||||||
" location='berlin',\n",
|
|
||||||
" search_term=\"engineer\",\n",
|
" search_term=\"engineer\",\n",
|
||||||
" country_indeed = \"Germany\",\n",
|
" country_indeed = \"China\",\n",
|
||||||
" hyperlinks=True\n",
|
" hyperlinks=True\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@@ -165,7 +159,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.11"
|
"version": "3.11.5"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
57
README.md
57
README.md
@@ -1,21 +1,26 @@
|
|||||||
<img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400">
|
<img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400">
|
||||||
|
|
||||||
**JobSpy** is a simple, yet comprehensive, job scraping library.
|
**JobSpy** is a simple, yet comprehensive, job scraping library.
|
||||||
|
|
||||||
|
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
|
||||||
|
|
||||||
|
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
|
||||||
|
\
|
||||||
|
Check out another project we wrote: ***[HomeHarvest](https://github.com/ZacharyHampton/HomeHarvest)** – a Python package for real estate scraping*
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
|
|
||||||
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
|
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
|
||||||
- Aggregates the job postings in a Pandas DataFrame
|
- Aggregates the job postings in a Pandas DataFrame
|
||||||
|
- Proxy support (HTTP/S, SOCKS)
|
||||||
|
|
||||||
[Video Guide for JobSpy](https://www.youtube.com/watch?v=-yS3mgI5H-4)
|
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - Updated for release v1.1.3
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
```
|
```
|
||||||
pip install python-jobspy
|
pip install --upgrade python-jobspy
|
||||||
```
|
```
|
||||||
|
|
||||||
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
|
||||||
@@ -34,27 +39,26 @@ jobs: pd.DataFrame = scrape_jobs(
|
|||||||
|
|
||||||
country_indeed='USA' # only needed for indeed
|
country_indeed='USA' # only needed for indeed
|
||||||
|
|
||||||
# use if you want to use a proxy
|
# use if you want to use a proxy (3 types)
|
||||||
# proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
# proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
||||||
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
||||||
# proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
# proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# formatting for pandas
|
||||||
pd.set_option('display.max_columns', None)
|
pd.set_option('display.max_columns', None)
|
||||||
pd.set_option('display.max_rows', None)
|
pd.set_option('display.max_rows', None)
|
||||||
pd.set_option('display.width', None)
|
pd.set_option('display.width', None)
|
||||||
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
|
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
|
||||||
|
|
||||||
#1 output
|
#1 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
|
||||||
print(jobs)
|
display(jobs)
|
||||||
print(errors)
|
|
||||||
|
|
||||||
#2 display in Jupyter Notebook
|
#2 output to console
|
||||||
#display(jobs)
|
#print(jobs)
|
||||||
#display(errors)
|
|
||||||
|
|
||||||
#3 output to .csv
|
#3 output to .csv
|
||||||
#result.jobs.to_csv('result.jobs.csv', index=False)
|
#jobs.to_csv('jobs.csv', index=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Output
|
### Output
|
||||||
@@ -76,10 +80,11 @@ Optional
|
|||||||
├── location (str)
|
├── location (str)
|
||||||
├── distance (int): in miles
|
├── distance (int): in miles
|
||||||
├── job_type (enum): fulltime, parttime, internship, contract
|
├── job_type (enum): fulltime, parttime, internship, contract
|
||||||
|
├── proxy (str): in format 'http://user:pass@host:port' (https:// and socks5:// schemes are also supported)
|
||||||
├── is_remote (bool)
|
├── is_remote (bool)
|
||||||
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
|
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
|
||||||
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
|
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
|
||||||
├── country_indeed (enum): filters the country on Indeed
|
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
@@ -103,20 +108,26 @@ JobPost
|
|||||||
└── date_posted (date)
|
└── date_posted (date)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Exceptions
|
||||||
|
The following exceptions may be raised when using JobSpy:
|
||||||
|
* `LinkedInException`
|
||||||
|
* `IndeedException`
|
||||||
|
* `ZipRecruiterException`
|
||||||
|
|
||||||
## Supported Countries for Job Searching
|
## Supported Countries for Job Searching
|
||||||
|
|
||||||
|
|
||||||
### **LinkedIn**
|
### **LinkedIn**
|
||||||
|
|
||||||
LinkedIn searches globally & uses only the `location` parameter
|
LinkedIn searches globally & uses only the `location` parameter.
|
||||||
|
|
||||||
### **ZipRecruiter**
|
### **ZipRecruiter**
|
||||||
|
|
||||||
ZipRecruiter searches for jobs in US/Canada & uses only the `location` parameter
|
ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
|
||||||
|
|
||||||
|
|
||||||
### **Indeed**
|
### **Indeed**
|
||||||
For Indeed, the `country_indeed` parameter is required. Additionally, use the `location` parameter and include the city or state if necessary.
|
Indeed supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location` parameter to narrow down the location, e.g. city & state if necessary.
|
||||||
|
|
||||||
You can specify the following countries when searching on Indeed (use the exact name):
|
You can specify the following countries when searching on Indeed (use the exact name):
|
||||||
|
|
||||||
@@ -145,18 +156,22 @@ You can specify the following countries when searching on Indeed (use the exact
|
|||||||
---
|
---
|
||||||
|
|
||||||
**Q: Encountering issues with your queries?**
|
**Q: Encountering issues with your queries?**
|
||||||
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, [submit an issue](#).
|
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, [submit an issue](https://github.com/cullenwatson/JobSpy/issues).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**Q: Received a response code 429?**
|
**Q: Received a response code 429?**
|
||||||
**A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently, **ZipRecruiter** is particularly aggressive with blocking. We recommend:
|
**A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently, **LinkedIn** is particularly aggressive with blocking. We recommend:
|
||||||
|
|
||||||
- Waiting a few seconds between requests.
|
- Waiting a few seconds between requests.
|
||||||
- Trying a VPN to change your IP address.
|
- Trying a VPN or proxy to change your IP address.
|
||||||
|
|
||||||
**Note:** Proxy support is in development and coming soon!
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
**Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?**
|
||||||
|
**A:** This is due to the `tls_client` dependency not supporting your architecture. Solutions and workarounds include:
|
||||||
|
- Upgrade to a newer version of macOS
|
||||||
|
- Reach out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.3"
|
version = "1.1.7"
|
||||||
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
||||||
|
homepage = "https://github.com/cullenwatson/JobSpy"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
packages = [
|
packages = [
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ def scrape_jobs(
|
|||||||
location: str = "",
|
location: str = "",
|
||||||
distance: int = None,
|
distance: int = None,
|
||||||
is_remote: bool = False,
|
is_remote: bool = False,
|
||||||
job_type: JobType = None,
|
job_type: str = None,
|
||||||
easy_apply: bool = False, # linkedin
|
easy_apply: bool = False, # linkedin
|
||||||
results_wanted: int = 15,
|
results_wanted: int = 15,
|
||||||
country_indeed: str = "usa",
|
country_indeed: str = "usa",
|
||||||
@@ -44,6 +44,14 @@ def scrape_jobs(
|
|||||||
:return: results_wanted: pandas dataframe containing job data
|
:return: results_wanted: pandas dataframe containing job data
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def get_enum_from_value(value_str):
|
||||||
|
for job_type in JobType:
|
||||||
|
if value_str in job_type.value:
|
||||||
|
return job_type
|
||||||
|
raise Exception(f"Invalid job type: {value_str}")
|
||||||
|
job_type = get_enum_from_value(job_type) if job_type else None
|
||||||
|
|
||||||
|
|
||||||
if type(site_name) == str:
|
if type(site_name) == str:
|
||||||
site_type = [_map_str_to_site(site_name)]
|
site_type = [_map_str_to_site(site_name)]
|
||||||
else: #: if type(site_name) == list
|
else: #: if type(site_name) == list
|
||||||
|
|||||||
Reference in New Issue
Block a user