mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 19:44:30 -08:00
add offset param & email extraction (#51)
* add offset param * [enh]: extract emails
This commit is contained in:
167
examples/JobSpy_Demo.ipynb
Normal file
167
examples/JobSpy_Demo.ipynb
Normal file
@@ -0,0 +1,167 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00a94b47-f47b-420f-ba7e-714ef219c006",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from jobspy import scrape_jobs\n",
|
||||
"import pandas as pd\n",
|
||||
"from IPython.display import display, HTML"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9f773e6c-d9fc-42cc-b0ef-63b739e78435",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.set_option('display.max_columns', None)\n",
|
||||
"pd.set_option('display.max_rows', None)\n",
|
||||
"pd.set_option('display.width', None)\n",
|
||||
"pd.set_option('display.max_colwidth', 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1253c1f8-9437-492e-9dd3-e7fe51099420",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# example 1 (no hyperlinks, USA)\n",
|
||||
"jobs = scrape_jobs(\n",
|
||||
" site_name=[\"linkedin\"],\n",
|
||||
" location='san francisco',\n",
|
||||
" search_term=\"engineer\",\n",
|
||||
" results_wanted=5,\n",
|
||||
"\n",
|
||||
" # use if you want to use a proxy\n",
|
||||
" # proxy=\"socks5://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
|
||||
" proxy=\"http://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
|
||||
" #proxy=\"https://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
|
||||
")\n",
|
||||
"display(jobs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6a581b2d-f7da-4fac-868d-9efe143ee20a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# example 2 - remote USA & hyperlinks\n",
|
||||
"jobs = scrape_jobs(\n",
|
||||
" site_name=[\"linkedin\", \"zip_recruiter\", \"indeed\"],\n",
|
||||
" # location='san francisco',\n",
|
||||
" search_term=\"software engineer\",\n",
|
||||
" country_indeed=\"USA\",\n",
|
||||
" hyperlinks=True,\n",
|
||||
" is_remote=True,\n",
|
||||
" results_wanted=5, \n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fe8289bc-5b64-4202-9a64-7c117c83fd9a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# use if hyperlinks=True\n",
|
||||
"html = jobs.to_html(escape=False)\n",
|
||||
"# change max-width: 200px to show more or less of the content\n",
|
||||
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
|
||||
"display(HTML(truncate_width))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "951c2fe1-52ff-407d-8bb1-068049b36777",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# example 3 - with hyperlinks, international - linkedin (no zip_recruiter)\n",
|
||||
"jobs = scrape_jobs(\n",
|
||||
" site_name=[\"linkedin\"],\n",
|
||||
" location='berlin',\n",
|
||||
" search_term=\"engineer\",\n",
|
||||
" hyperlinks=True,\n",
|
||||
" results_wanted=5,\n",
|
||||
" easy_apply=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1e37a521-caef-441c-8fc2-2eb5b2e7da62",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# use if hyperlinks=True\n",
|
||||
"html = jobs.to_html(escape=False)\n",
|
||||
"# change max-width: 200px to show more or less of the content\n",
|
||||
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
|
||||
"display(HTML(truncate_width))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0650e608-0b58-4bf5-ae86-68348035b16a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# example 4 - international indeed (no zip_recruiter)\n",
|
||||
"jobs = scrape_jobs(\n",
|
||||
" site_name=[\"indeed\"],\n",
|
||||
" search_term=\"engineer\",\n",
|
||||
" country_indeed = \"China\",\n",
|
||||
" hyperlinks=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "40913ac8-3f8a-4d7e-ac47-afb88316432b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# use if hyperlinks=True\n",
|
||||
"html = jobs.to_html(escape=False)\n",
|
||||
"# change max-width: 200px to show more or less of the content\n",
|
||||
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
|
||||
"display(HTML(truncate_width))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
31
examples/JobSpy_Demo.py
Normal file
31
examples/JobSpy_Demo.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from jobspy import scrape_jobs
|
||||
import pandas as pd
|
||||
|
||||
jobs: pd.DataFrame = scrape_jobs(
|
||||
site_name=["indeed", "linkedin", "zip_recruiter"],
|
||||
search_term="software engineer",
|
||||
location="Dallas, TX",
|
||||
results_wanted=50, # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho)
|
||||
country_indeed='USA',
|
||||
offset=25 # start jobs from an offset (use if search failed and want to continue)
|
||||
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
|
||||
)
|
||||
|
||||
# formatting for pandas
|
||||
pd.set_option('display.max_columns', None)
|
||||
pd.set_option('display.max_rows', None)
|
||||
pd.set_option('display.width', None)
|
||||
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
|
||||
|
||||
# 1: output to console
|
||||
print(jobs)
|
||||
|
||||
# 2: output to .csv
|
||||
jobs.to_csv('./jobs.csv', index=False)
|
||||
print('outputted to jobs.csv')
|
||||
|
||||
# 3: output to .xlsx
|
||||
# jobs.to_xlsx('jobs.xlsx', index=False)
|
||||
|
||||
# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
|
||||
# display(jobs)
|
||||
Reference in New Issue
Block a user