Compare commits

...

16 Commits

Author SHA1 Message Date
Cullen Watson
558e352939 fix: job type param bug 2023-09-21 17:42:24 -05:00
Zachary Hampton
efad1a1b7d Update README.md 2023-09-21 09:52:18 -07:00
Cullen Watson
eaa481c2f4 docs: add macos catalina to faq 2023-09-19 12:50:14 -05:00
Zachary Hampton
b914aa6449 Update README.md 2023-09-16 13:52:30 -07:00
Zachary Hampton
6adbfb8b29 Update README.md 2023-09-16 13:51:45 -07:00
Zachary Hampton
a3b9dd50ff (docs) homepage 2023-09-15 16:14:26 -07:00
Zachary Hampton
d3ba3a4878 docs: sales call 2023-09-15 11:51:22 -07:00
Cullen Watson
f524789d74 docs: grammar readme 2023-09-15 10:18:24 -05:00
Cullen Watson
f3890d4830 docs: update 2023-09-09 10:55:33 -05:00
Cullen Watson
60c9728691 docs: typo 2023-09-08 12:27:49 -05:00
Cullen Watson
f79d975e5f docs: clarify - README.md 2023-09-07 13:46:14 -05:00
Cullen Watson
d6368f909b docs: typo 2023-09-07 13:39:56 -05:00
Cullen Watson
6fcf7f666e docs: update typo in example 2023-09-07 13:37:53 -05:00
Cullen Watson
4406f9350f docs: update vid 2023-09-07 13:35:10 -05:00
Cullen Watson
ca5155f234 docs: add feature 2023-09-07 11:36:16 -05:00
Cullen Watson
822a55783e docs: temp update 2023-09-07 11:35:14 -05:00
4 changed files with 53 additions and 37 deletions

View File

@@ -9,7 +9,7 @@
"source": [ "source": [
"from jobspy import scrape_jobs\n", "from jobspy import scrape_jobs\n",
"import pandas as pd\n", "import pandas as pd\n",
"from IPython.display import display, HTML\n" "from IPython.display import display, HTML"
] ]
}, },
{ {
@@ -34,18 +34,16 @@
"source": [ "source": [
"# example 1 (no hyperlinks, USA)\n", "# example 1 (no hyperlinks, USA)\n",
"jobs = scrape_jobs(\n", "jobs = scrape_jobs(\n",
" site_name=[\"linkedin\", \"zip_recruiter\"],\n", " site_name=[\"linkedin\"],\n",
" location='san francisco',\n", " location='san francisco',\n",
" search_term=\"engineer\",\n", " search_term=\"engineer\",\n",
" results_wanted=5,\n", " results_wanted=5,\n",
"\n", "\n",
" # use if you want to use a proxy\n", " # use if you want to use a proxy\n",
" # proxy=\"socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n", " # proxy=\"socks5://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
" # proxy=\"http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n", " proxy=\"http://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
" # proxy=\"https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n", " #proxy=\"https://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
"\n",
")\n", ")\n",
"\n",
"display(jobs)" "display(jobs)"
] ]
}, },
@@ -97,9 +95,6 @@
" hyperlinks=True,\n", " hyperlinks=True,\n",
" results_wanted=5,\n", " results_wanted=5,\n",
" easy_apply=True\n", " easy_apply=True\n",
"\n",
"\n",
"\n",
")" ")"
] ]
}, },
@@ -125,11 +120,10 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# example 4 - international indeed (no zip_recruiter)\n", "# example 4 - international indeed (no zip_recruiter)\n",
"result = scrape_jobs(\n", "jobs = scrape_jobs(\n",
" site_name=[\"indeed\"],\n", " site_name=[\"indeed\"],\n",
" location='berlin',\n",
" search_term=\"engineer\",\n", " search_term=\"engineer\",\n",
" country_indeed = \"Germany\",\n", " country_indeed = \"China\",\n",
" hyperlinks=True\n", " hyperlinks=True\n",
")" ")"
] ]
@@ -165,7 +159,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.11" "version": "3.11.5"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -1,21 +1,24 @@
<img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400"> <img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400">
**JobSpy** is a simple, yet comprehensive, job scraping library. **JobSpy** is a simple, yet comprehensive, job scraping library.
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.*
\
Check out another project we wrote: ***[HomeHarvest](https://github.com/ZacharyHampton/HomeHarvest)** – a Python package for real estate scraping*
## Features ## Features
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame - Aggregates the job postings in a Pandas DataFrame
- Proxy support (HTTP/S, SOCKS)
[Video Guide for JobSpy](https://www.youtube.com/watch?v=-yS3mgI5H-4) [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - Updated for release v1.1.3
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57) ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
### Installation ### Installation
``` ```
pip install python-jobspy pip install --upgrade python-jobspy
``` ```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
@@ -34,27 +37,26 @@ jobs: pd.DataFrame = scrape_jobs(
country_indeed='USA' # only needed for indeed country_indeed='USA' # only needed for indeed
# use if you want to use a proxy # use if you want to use a proxy (3 types)
# proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
) )
# formatting for pandas
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None) pd.set_option('display.max_rows', None)
pd.set_option('display.width', None) pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
#1 output #1 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
print(jobs) display(jobs)
print(errors)
#2 display in Jupyter Notebook #2 output to console
#display(jobs) #print(jobs)
#display(errors)
#3 output to .csv #3 output to .csv
#result.jobs.to_csv('result.jobs.csv', index=False) #jobs.to_csv('jobs.csv', index=False)
``` ```
### Output ### Output
@@ -76,10 +78,11 @@ Optional
├── location (str) ├── location (str)
├── distance (int): in miles ├── distance (int): in miles
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
├── is_remote (bool) ├── is_remote (bool)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
├── country_indeed (enum): filters the country on Indeed ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
``` ```
@@ -103,20 +106,26 @@ JobPost
└── date_posted (date) └── date_posted (date)
``` ```
### Exceptions
The following exceptions may be raised when using JobSpy:
* `LinkedInException`
* `IndeedException`
* `ZipRecruiterException`
## Supported Countries for Job Searching ## Supported Countries for Job Searching
### **LinkedIn** ### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter LinkedIn searches globally & uses only the `location` parameter.
### **ZipRecruiter** ### **ZipRecruiter**
ZipRecruiter searches for jobs in US/Canada & uses only the `location` parameter ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
### **Indeed** ### **Indeed**
For Indeed, the `country_indeed` parameter is required. Additionally, use the `location` parameter and include the city or state if necessary. Indeed supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location` parameter to narrow down the location, e.g. city & state if necessary.
You can specify the following countries when searching on Indeed (use the exact name): You can specify the following countries when searching on Indeed (use the exact name):
@@ -145,18 +154,22 @@ You can specify the following countries when searching on Indeed (use the exact
--- ---
**Q: Encountering issues with your queries?** **Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, [submit an issue](#). **A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, [submit an issue](https://github.com/cullenwatson/JobSpy/issues).
--- ---
**Q: Received a response code 429?** **Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently, **ZipRecruiter** is particularly aggressive with blocking. We recommend: **A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently, **LinkedIn** is particularly aggressive with blocking. We recommend:
- Waiting a few seconds between requests. - Waiting a few seconds between requests.
- Trying a VPN to change your IP address. - Trying a VPN or proxy to change your IP address.
**Note:** Proxy support is in development and coming soon!
--- ---
**Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?**
**A:** This is due to the `tls_client` dependency not supporting your architecture. Solutions and workarounds include:
- Upgrade to a newer version of macOS
- Reach out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes

View File

@@ -1,8 +1,9 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.3" version = "1.1.5"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy"
readme = "README.md" readme = "README.md"
packages = [ packages = [

View File

@@ -32,7 +32,7 @@ def scrape_jobs(
location: str = "", location: str = "",
distance: int = None, distance: int = None,
is_remote: bool = False, is_remote: bool = False,
job_type: JobType = None, job_type: str = None,
easy_apply: bool = False, # linkedin easy_apply: bool = False, # linkedin
results_wanted: int = 15, results_wanted: int = 15,
country_indeed: str = "usa", country_indeed: str = "usa",
@@ -44,6 +44,14 @@ def scrape_jobs(
:return: results_wanted: pandas dataframe containing job data :return: results_wanted: pandas dataframe containing job data
""" """
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return job_type
raise Exception(f"Invalid job type: {value_str}")
job_type = get_enum_from_value(job_type) if job_type else None
if type(site_name) == str: if type(site_name) == str:
site_type = [_map_str_to_site(site_name)] site_type = [_map_str_to_site(site_name)]
else: #: if type(site_name) == list else: #: if type(site_name) == list