add offset param & email extraction (#51)

* add offset param * [enh]: extract emails
2026-03-04 19:44:30 -08:00 · 2023-09-28 18:11:28 -05:00
parent 286b9e1256
commit af07c1ecbd
17 changed files with 1209 additions and 1126 deletions
--- a/examples/JobSpy_Demo.ipynb
+++ b/examples/JobSpy_Demo.ipynb
@@ -0,0 +1,167 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00a94b47-f47b-420f-ba7e-714ef219c006",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from jobspy import scrape_jobs\n",
+    "import pandas as pd\n",
+    "from IPython.display import display, HTML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f773e6c-d9fc-42cc-b0ef-63b739e78435",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.max_columns', None)\n",
+    "pd.set_option('display.max_rows', None)\n",
+    "pd.set_option('display.width', None)\n",
+    "pd.set_option('display.max_colwidth', 50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1253c1f8-9437-492e-9dd3-e7fe51099420",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# example 1 (no hyperlinks, USA)\n",
+    "jobs = scrape_jobs(\n",
+    "    site_name=[\"linkedin\"],\n",
+    "    location='san francisco',\n",
+    "    search_term=\"engineer\",\n",
+    "    results_wanted=5,\n",
+    "\n",
+    "    # proxy is optional: delete the active proxy= line below to connect directly, or substitute your own proxy URL\n",
+    "    # proxy=\"socks5://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
+    "    proxy=\"http://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
+    "    #proxy=\"https://jobspy:5a4vpWtj4EeJ2hoYzk@us.smartproxy.com:10001\",\n",
+    ")\n",
+    "display(jobs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a581b2d-f7da-4fac-868d-9efe143ee20a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# example 2 - remote USA & hyperlinks\n",
+    "jobs = scrape_jobs(\n",
+    "    site_name=[\"linkedin\", \"zip_recruiter\", \"indeed\"],\n",
+    "    # location='san francisco',\n",
+    "    search_term=\"software engineer\",\n",
+    "    country_indeed=\"USA\",\n",
+    "    hyperlinks=True,\n",
+    "    is_remote=True,\n",
+    "    results_wanted=5, \n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe8289bc-5b64-4202-9a64-7c117c83fd9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use if hyperlinks=True\n",
+    "html = jobs.to_html(escape=False)\n",
+    "# change max-width: 200px to show more or less of the content\n",
+    "truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
+    "display(HTML(truncate_width))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "951c2fe1-52ff-407d-8bb1-068049b36777",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# example 3 - with hyperlinks, international - linkedin (no zip_recruiter)\n",
+    "jobs = scrape_jobs(\n",
+    "    site_name=[\"linkedin\"],\n",
+    "    location='berlin',\n",
+    "    search_term=\"engineer\",\n",
+    "    hyperlinks=True,\n",
+    "    results_wanted=5,\n",
+    "    easy_apply=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e37a521-caef-441c-8fc2-2eb5b2e7da62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use if hyperlinks=True\n",
+    "html = jobs.to_html(escape=False)\n",
+    "# change max-width: 200px to show more or less of the content\n",
+    "truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
+    "display(HTML(truncate_width))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0650e608-0b58-4bf5-ae86-68348035b16a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# example 4 - international indeed (no zip_recruiter)\n",
+    "jobs = scrape_jobs(\n",
+    "    site_name=[\"indeed\"],\n",
+    "    search_term=\"engineer\",\n",
+    "    country_indeed = \"China\",\n",
+    "    hyperlinks=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40913ac8-3f8a-4d7e-ac47-afb88316432b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use if hyperlinks=True\n",
+    "html = jobs.to_html(escape=False)\n",
+    "# change max-width: 200px to show more or less of the content\n",
+    "truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
+    "display(HTML(truncate_width))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/examples/JobSpy_Demo.py
+++ b/examples/JobSpy_Demo.py
@@ -0,0 +1,31 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+jobs: pd.DataFrame = scrape_jobs(
+    site_name=["indeed", "linkedin", "zip_recruiter"],
+    search_term="software engineer",
+    location="Dallas, TX",
+    results_wanted=50,  # be wary: the higher it is, the more likely you'll get blocked (a rotating proxy should help)
+    country_indeed='USA',
+    offset=25  # start jobs from an offset (use if a search failed and you want to continue); add a trailing comma here before enabling the proxy line below
+    # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
+)
+
+# pandas display options: no cap on the number of columns or rows printed
+pd.set_option('display.max_columns', None)
+pd.set_option('display.max_rows', None)
+pd.set_option('display.width', None)
+pd.set_option('display.max_colwidth', 50)  # cell text is cut at 50 characters; set to 0 to see full job url / desc
+
+# 1: output to console
+print(jobs)
+
+# 2: output to .csv (index=False leaves the DataFrame index out of the file)
+jobs.to_csv('./jobs.csv', index=False)
+print('outputted to jobs.csv')
+
+# 3: output to .xlsx (the pandas writer is DataFrame.to_excel; there is no to_xlsx)
+# jobs.to_excel('jobs.xlsx', index=False)
+
+# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
+# display(jobs)