fix bugs in migration

pull/31/head
Cullen Watson 2023-09-03 09:25:00 -05:00
parent 5131cf49b2
commit 922dda51e6
16 changed files with 2934 additions and 424 deletions

4
.gitignore vendored
View File

@ -5,5 +5,5 @@
**/__pycache__/ **/__pycache__/
*.pyc *.pyc
.env .env
client_secret.json dist
dist /.ipynb_checkpoints/

View File

@ -1,15 +0,0 @@
# Image for the JobSpy API server, built on the slim Python 3.10 base image.
FROM python:3.10-slim
WORKDIR /app
COPY . /app
# Install jq from apt, then the Python dependencies listed in requirements.txt.
RUN apt-get update && \
apt-get install -y jq && \
pip install --no-cache-dir -r requirements.txt
# Default port is 8000; PORT can be overridden when the container is started.
EXPOSE 8000
ENV PORT=8000
# Launch through a shell so that $PORT is expanded when the container starts.
CMD sh -c "uvicorn main:app --host 0.0.0.0 --port $PORT"

702
JobSpy_Demo.ipynb Normal file
View File

@ -0,0 +1,702 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "c3f21577-477d-451e-9914-5d67e8a89075",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>site</th>\n",
" <th>title</th>\n",
" <th>company_name</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>job_type</th>\n",
" <th>interval</th>\n",
" <th>min_amount</th>\n",
" <th>max_amount</th>\n",
" <th>job_url</th>\n",
" <th>description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>indeed</td>\n",
" <td>Firmware Engineer</td>\n",
" <td>Advanced Motion Controls</td>\n",
" <td>Camarillo</td>\n",
" <td>CA</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>145000</td>\n",
" <td>110000</td>\n",
" <td>https://www.indeed.com/viewjob?jk=a2e7077fdd3c...</td>\n",
" <td>We are looking for an experienced Firmware Eng...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>indeed</td>\n",
" <td>Computer Engineer</td>\n",
" <td>Honeywell</td>\n",
" <td></td>\n",
" <td>None</td>\n",
" <td>fulltime</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.indeed.com/viewjob?jk=5a1da623ee75...</td>\n",
" <td>Join a team recognized for leadership, innovat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>indeed</td>\n",
" <td>Software Engineer</td>\n",
" <td>Splunk</td>\n",
" <td>Remote</td>\n",
" <td>None</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>159500</td>\n",
" <td>116000</td>\n",
" <td>https://www.indeed.com/viewjob?jk=155495ca3f46...</td>\n",
" <td>A little about us. Splunk is the key to enterp...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>indeed</td>\n",
" <td>Development Operations Engineer</td>\n",
" <td>Stratacache</td>\n",
" <td>Dayton</td>\n",
" <td>OH</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>90000</td>\n",
" <td>83573</td>\n",
" <td>https://www.indeed.com/viewjob?jk=77cf3540c06e...</td>\n",
" <td>Stratacache, Inc. delivers in-store retail exp...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>indeed</td>\n",
" <td>Computer Engineer</td>\n",
" <td>Honeywell</td>\n",
" <td></td>\n",
" <td>None</td>\n",
" <td>fulltime</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.indeed.com/viewjob?jk=7fadbb7c936f...</td>\n",
" <td>Join a team recognized for leadership, innovat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>indeed</td>\n",
" <td>Full Stack Developer</td>\n",
" <td>Reinventing Geospatial, Inc. (RGi)</td>\n",
" <td>Herndon</td>\n",
" <td>VA</td>\n",
" <td>fulltime</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.indeed.com/viewjob?jk=11b2b5b0dd44...</td>\n",
" <td>Job Highlights As a Full Stack Software Engine...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>indeed</td>\n",
" <td>Software Engineer</td>\n",
" <td>Workiva</td>\n",
" <td>Remote</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>yearly</td>\n",
" <td>134000</td>\n",
" <td>79000</td>\n",
" <td>https://www.indeed.com/viewjob?jk=ec3ab6eb9253...</td>\n",
" <td>Are you ready to embark on an exciting journey...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>indeed</td>\n",
" <td>Senior Software Engineer</td>\n",
" <td>SciTec</td>\n",
" <td>Boulder</td>\n",
" <td>CO</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>164000</td>\n",
" <td>93000</td>\n",
" <td>https://www.indeed.com/viewjob?jk=781e4cf0cf6d...</td>\n",
" <td>SciTec has been awarded multiple government co...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>indeed</td>\n",
" <td>Software Engineer</td>\n",
" <td>Microsoft</td>\n",
" <td></td>\n",
" <td>None</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>182600</td>\n",
" <td>94300</td>\n",
" <td>https://www.indeed.com/viewjob?jk=21e05b9e9d96...</td>\n",
" <td>At Microsoft we are seeking people who have a ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>indeed</td>\n",
" <td>Software Engineer</td>\n",
" <td>Avalon Healthcare Solutions</td>\n",
" <td>Remote</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.indeed.com/viewjob?jk=da35b9bb74a0...</td>\n",
" <td>Avalon Healthcare Solutions, headquartered in ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer</td>\n",
" <td>Fieldguide</td>\n",
" <td>San Francisco</td>\n",
" <td>CA</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3696158160</td>\n",
" <td>About us:Fieldguide is establishing a new stat...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer - Early Career</td>\n",
" <td>Lockheed Martin</td>\n",
" <td>Sunnyvale</td>\n",
" <td>CA</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3693012711</td>\n",
" <td>Description:By bringing together people that u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer - Early Career</td>\n",
" <td>Lockheed Martin</td>\n",
" <td>Edwards</td>\n",
" <td>CA</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3700669785</td>\n",
" <td>Description:By bringing together people that u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer - Early Career</td>\n",
" <td>Lockheed Martin</td>\n",
" <td>Fort Worth</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3701775201</td>\n",
" <td>Description:By bringing together people that u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer - Early Career</td>\n",
" <td>Lockheed Martin</td>\n",
" <td>Fort Worth</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3701772329</td>\n",
" <td>Description:By bringing together people that u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer - Early Career</td>\n",
" <td>Lockheed Martin</td>\n",
" <td>Fort Worth</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3701769637</td>\n",
" <td>Description:By bringing together people that u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer</td>\n",
" <td>SpiderOak</td>\n",
" <td>Austin</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3707174719</td>\n",
" <td>We're only as strong as our weakest link.In th...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer - Early Career</td>\n",
" <td>Lockheed Martin</td>\n",
" <td>Fort Worth</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3701770659</td>\n",
" <td>Description:By bringing together people that u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>linkedin</td>\n",
" <td>Full-Stack Software Engineer</td>\n",
" <td>Rain</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3696158877</td>\n",
" <td>Rains mission is to create the fastest and ea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>linkedin</td>\n",
" <td>Software Engineer</td>\n",
" <td>Nike</td>\n",
" <td>Portland</td>\n",
" <td>OR</td>\n",
" <td>contract</td>\n",
" <td>yearly</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://www.linkedin.com/jobs/view/3693340247</td>\n",
" <td>Work options: FlexibleWe consider remote, on-p...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>zip_recruiter</td>\n",
" <td>(USA) Software Engineer III - Prototype Engine...</td>\n",
" <td>Walmart</td>\n",
" <td>Dallas</td>\n",
" <td>TX</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>https://click.appcast.io/track/hcgsw4k?cs=ngp&amp;...</td>\n",
" <td>We are currently seeking a highly skilled and ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Engineer - New Grad</td>\n",
" <td>ZipRecruiter</td>\n",
" <td>Santa Monica</td>\n",
" <td>CA</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>130000</td>\n",
" <td>150000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/ziprecruiter...</td>\n",
" <td>We offer a hybrid work environment. Most US-ba...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Developer</td>\n",
" <td>Robert Half</td>\n",
" <td>Corpus Christi</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>105000</td>\n",
" <td>115000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/robert-half-...</td>\n",
" <td>Robert Half has an opening for a Software Deve...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Engineer</td>\n",
" <td>Advantage Technical</td>\n",
" <td>Ontario</td>\n",
" <td>CA</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>100000</td>\n",
" <td>150000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/advantage-te...</td>\n",
" <td>New career opportunity available with major Ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Developer</td>\n",
" <td>Robert Half</td>\n",
" <td>Tucson</td>\n",
" <td>AZ</td>\n",
" <td>temporary</td>\n",
" <td>hourly</td>\n",
" <td>47</td>\n",
" <td>55</td>\n",
" <td>https://www.ziprecruiter.com/jobs/robert-half-...</td>\n",
" <td>Robert Half is accepting inquiries for a SQL S...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Full Stack Software Engineer</td>\n",
" <td>ZipRecruiter</td>\n",
" <td>Phoenix</td>\n",
" <td>AZ</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>105000</td>\n",
" <td>145000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/ziprecruiter...</td>\n",
" <td>We offer a hybrid work environment. Most US-ba...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Developer IV</td>\n",
" <td>Kforce Inc.</td>\n",
" <td>Mountain View</td>\n",
" <td>CA</td>\n",
" <td>contract</td>\n",
" <td>hourly</td>\n",
" <td>55</td>\n",
" <td>75</td>\n",
" <td>https://www.kforce.com/Jobs/job.aspx?job=1696~...</td>\n",
" <td>Kforce has a client that is seeking a Software...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Developer | Onsite | Omaha, NE - Omaha</td>\n",
" <td>OneStaff Medical</td>\n",
" <td>Omaha</td>\n",
" <td>NE</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>60000</td>\n",
" <td>110000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/onestaff-med...</td>\n",
" <td>Company Description: We are looking for a well...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Senior Software Engineer</td>\n",
" <td>RightStaff, Inc.</td>\n",
" <td>Dallas</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>120000</td>\n",
" <td>180000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/rightstaff-i...</td>\n",
" <td>Job Description:We are seeking a talented and ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>zip_recruiter</td>\n",
" <td>Software Developer - .Net Core - 12886</td>\n",
" <td>Walker Elliott</td>\n",
" <td>Dallas</td>\n",
" <td>TX</td>\n",
" <td>fulltime</td>\n",
" <td>yearly</td>\n",
" <td>105000</td>\n",
" <td>130000</td>\n",
" <td>https://www.ziprecruiter.com/jobs/walker-ellio...</td>\n",
" <td>Our highly successful DFW based client has bee...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" site title \\\n",
"0 indeed Firmware Engineer \n",
"1 indeed Computer Engineer \n",
"2 indeed Software Engineer \n",
"3 indeed Development Operations Engineer \n",
"4 indeed Computer Engineer \n",
"5 indeed Full Stack Developer \n",
"6 indeed Software Engineer \n",
"7 indeed Senior Software Engineer \n",
"8 indeed Software Engineer \n",
"9 indeed Software Engineer \n",
"10 linkedin Software Engineer \n",
"11 linkedin Software Engineer - Early Career \n",
"12 linkedin Software Engineer - Early Career \n",
"13 linkedin Software Engineer - Early Career \n",
"14 linkedin Software Engineer - Early Career \n",
"15 linkedin Software Engineer - Early Career \n",
"16 linkedin Software Engineer \n",
"17 linkedin Software Engineer - Early Career \n",
"18 linkedin Full-Stack Software Engineer \n",
"19 linkedin Software Engineer \n",
"20 zip_recruiter (USA) Software Engineer III - Prototype Engine... \n",
"21 zip_recruiter Software Engineer - New Grad \n",
"22 zip_recruiter Software Developer \n",
"23 zip_recruiter Software Engineer \n",
"24 zip_recruiter Software Developer \n",
"25 zip_recruiter Full Stack Software Engineer \n",
"26 zip_recruiter Software Developer IV \n",
"27 zip_recruiter Software Developer | Onsite | Omaha, NE - Omaha \n",
"28 zip_recruiter Senior Software Engineer \n",
"29 zip_recruiter Software Developer - .Net Core - 12886 \n",
"\n",
" company_name city state job_type \\\n",
"0 Advanced Motion Controls Camarillo CA fulltime \n",
"1 Honeywell None fulltime \n",
"2 Splunk Remote None fulltime \n",
"3 Stratacache Dayton OH fulltime \n",
"4 Honeywell None fulltime \n",
"5 Reinventing Geospatial, Inc. (RGi) Herndon VA fulltime \n",
"6 Workiva Remote None None \n",
"7 SciTec Boulder CO fulltime \n",
"8 Microsoft None fulltime \n",
"9 Avalon Healthcare Solutions Remote None None \n",
"10 Fieldguide San Francisco CA fulltime \n",
"11 Lockheed Martin Sunnyvale CA fulltime \n",
"12 Lockheed Martin Edwards CA fulltime \n",
"13 Lockheed Martin Fort Worth TX fulltime \n",
"14 Lockheed Martin Fort Worth TX fulltime \n",
"15 Lockheed Martin Fort Worth TX fulltime \n",
"16 SpiderOak Austin TX fulltime \n",
"17 Lockheed Martin Fort Worth TX fulltime \n",
"18 Rain New York NY fulltime \n",
"19 Nike Portland OR contract \n",
"20 Walmart Dallas TX None \n",
"21 ZipRecruiter Santa Monica CA fulltime \n",
"22 Robert Half Corpus Christi TX fulltime \n",
"23 Advantage Technical Ontario CA fulltime \n",
"24 Robert Half Tucson AZ temporary \n",
"25 ZipRecruiter Phoenix AZ fulltime \n",
"26 Kforce Inc. Mountain View CA contract \n",
"27 OneStaff Medical Omaha NE fulltime \n",
"28 RightStaff, Inc. Dallas TX fulltime \n",
"29 Walker Elliott Dallas TX fulltime \n",
"\n",
" interval min_amount max_amount \\\n",
"0 yearly 145000 110000 \n",
"1 None None None \n",
"2 yearly 159500 116000 \n",
"3 yearly 90000 83573 \n",
"4 None None None \n",
"5 None None None \n",
"6 yearly 134000 79000 \n",
"7 yearly 164000 93000 \n",
"8 yearly 182600 94300 \n",
"9 None None None \n",
"10 yearly None None \n",
"11 yearly None None \n",
"12 yearly None None \n",
"13 yearly None None \n",
"14 yearly None None \n",
"15 yearly None None \n",
"16 yearly None None \n",
"17 yearly None None \n",
"18 yearly None None \n",
"19 yearly None None \n",
"20 None None None \n",
"21 yearly 130000 150000 \n",
"22 yearly 105000 115000 \n",
"23 yearly 100000 150000 \n",
"24 hourly 47 55 \n",
"25 yearly 105000 145000 \n",
"26 hourly 55 75 \n",
"27 yearly 60000 110000 \n",
"28 yearly 120000 180000 \n",
"29 yearly 105000 130000 \n",
"\n",
" job_url \\\n",
"0 https://www.indeed.com/viewjob?jk=a2e7077fdd3c... \n",
"1 https://www.indeed.com/viewjob?jk=5a1da623ee75... \n",
"2 https://www.indeed.com/viewjob?jk=155495ca3f46... \n",
"3 https://www.indeed.com/viewjob?jk=77cf3540c06e... \n",
"4 https://www.indeed.com/viewjob?jk=7fadbb7c936f... \n",
"5 https://www.indeed.com/viewjob?jk=11b2b5b0dd44... \n",
"6 https://www.indeed.com/viewjob?jk=ec3ab6eb9253... \n",
"7 https://www.indeed.com/viewjob?jk=781e4cf0cf6d... \n",
"8 https://www.indeed.com/viewjob?jk=21e05b9e9d96... \n",
"9 https://www.indeed.com/viewjob?jk=da35b9bb74a0... \n",
"10 https://www.linkedin.com/jobs/view/3696158160 \n",
"11 https://www.linkedin.com/jobs/view/3693012711 \n",
"12 https://www.linkedin.com/jobs/view/3700669785 \n",
"13 https://www.linkedin.com/jobs/view/3701775201 \n",
"14 https://www.linkedin.com/jobs/view/3701772329 \n",
"15 https://www.linkedin.com/jobs/view/3701769637 \n",
"16 https://www.linkedin.com/jobs/view/3707174719 \n",
"17 https://www.linkedin.com/jobs/view/3701770659 \n",
"18 https://www.linkedin.com/jobs/view/3696158877 \n",
"19 https://www.linkedin.com/jobs/view/3693340247 \n",
"20 https://click.appcast.io/track/hcgsw4k?cs=ngp&... \n",
"21 https://www.ziprecruiter.com/jobs/ziprecruiter... \n",
"22 https://www.ziprecruiter.com/jobs/robert-half-... \n",
"23 https://www.ziprecruiter.com/jobs/advantage-te... \n",
"24 https://www.ziprecruiter.com/jobs/robert-half-... \n",
"25 https://www.ziprecruiter.com/jobs/ziprecruiter... \n",
"26 https://www.kforce.com/Jobs/job.aspx?job=1696~... \n",
"27 https://www.ziprecruiter.com/jobs/onestaff-med... \n",
"28 https://www.ziprecruiter.com/jobs/rightstaff-i... \n",
"29 https://www.ziprecruiter.com/jobs/walker-ellio... \n",
"\n",
" description \n",
"0 We are looking for an experienced Firmware Eng... \n",
"1 Join a team recognized for leadership, innovat... \n",
"2 A little about us. Splunk is the key to enterp... \n",
"3 Stratacache, Inc. delivers in-store retail exp... \n",
"4 Join a team recognized for leadership, innovat... \n",
"5 Job Highlights As a Full Stack Software Engine... \n",
"6 Are you ready to embark on an exciting journey... \n",
"7 SciTec has been awarded multiple government co... \n",
"8 At Microsoft we are seeking people who have a ... \n",
"9 Avalon Healthcare Solutions, headquartered in ... \n",
"10 About us:Fieldguide is establishing a new stat... \n",
"11 Description:By bringing together people that u... \n",
"12 Description:By bringing together people that u... \n",
"13 Description:By bringing together people that u... \n",
"14 Description:By bringing together people that u... \n",
"15 Description:By bringing together people that u... \n",
"16 We're only as strong as our weakest link.In th... \n",
"17 Description:By bringing together people that u... \n",
"18 Rains mission is to create the fastest and ea... \n",
"19 Work options: FlexibleWe consider remote, on-p... \n",
"20 We are currently seeking a highly skilled and ... \n",
"21 We offer a hybrid work environment. Most US-ba... \n",
"22 Robert Half has an opening for a Software Deve... \n",
"23 New career opportunity available with major Ma... \n",
"24 Robert Half is accepting inquiries for a SQL S... \n",
"25 We offer a hybrid work environment. Most US-ba... \n",
"26 Kforce has a client that is seeking a Software... \n",
"27 Company Description: We are looking for a well... \n",
"28 Job Description:We are seeking a talented and ... \n",
"29 Our highly successful DFW based client has bee... "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from jobscrape import scrape_jobs\n",
"import pandas as pd\n",
"\n",
"jobs: pd.DataFrame = scrape_jobs(\n",
" site_name=[\"indeed\", \"linkedin\", \"zip_recruiter\"],\n",
" search_term=\"software engineer\",\n",
" results_wanted=10\n",
")\n",
"\n",
"if jobs.empty:\n",
" print(\"No jobs found.\")\n",
"else:\n",
"\n",
" #1 print\n",
" pd.set_option('display.max_columns', None)\n",
" pd.set_option('display.max_rows', None)\n",
" pd.set_option('display.width', None)\n",
" pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc\n",
" print(jobs)\n",
"\n",
" #2 display in Jupyter Notebook\n",
" display(jobs)\n",
"\n",
" #3 output to csv\n",
" jobs.to_csv('jobs.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efd667ef-fdf0-452a-b5e5-ce6825755be7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1574dc17-0a42-4655-964f-5c03a6d3deb0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "my-poetry-env",
"language": "python",
"name": "my-poetry-env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

290
README.md
View File

@ -1,240 +1,100 @@
# JobSpy AIO Scraper # JobSpy
**JobSpy** is a simple, yet comprehensive, job scraping library.
## Features ## Features
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Returns jobs as JSON or CSV with title, location, company, description & other data - Aggregates the job postings in a Pandas DataFrame
- Imports directly into **Google Sheets**
- Optional JWT authorization
![jobspy_gsheet](https://github.com/cullenwatson/JobSpy/assets/78247585/9f0a997c-4e33-4167-b04e-31ab1f606edb) ### Installation
`pip install jobscrape`
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
### Usage
```python
from jobscrape import scrape_jobs
import pandas as pd
jobs: pd.DataFrame = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer",
results_wanted=10
)
if jobs.empty:
print("No jobs found.")
else:
#1 print
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
print(jobs)
#2 display in Jupyter Notebook
display(jobs)
#3 output to csv
jobs.to_csv('jobs.csv', index=False)
```
### Output
```
site title company_name city state job_type interval min_amount max_amount job_url description
indeed Software Engineer AMERICAN SYSTEMS Arlington VA None yearly 200000 150000 https://www.indeed.com/viewjob?jk=5e409e577046... THIS POSITION COMES WITH A 10K SIGNING BONUS! ...
indeed Senior Software Engineer TherapyNotes.com Philadelphia PA fulltime yearly 135000 110000 https://www.indeed.com/viewjob?jk=da39574a40cb... About Us TherapyNotes is the national leader i...
linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale CA fulltime yearly None None https://www.linkedin.com/jobs/view/3693012711 Description:By bringing together people that u...
linkedin Full-Stack Software Engineer Rain New York NY fulltime yearly None None https://www.linkedin.com/jobs/view/3696158877 Rains mission is to create the fastest and ea...
zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba...
zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme...
```
### Parameters for `scrape_jobs()`
### API
POST `/api/v1/jobs/`
### Request Schema
```plaintext ```plaintext
Required Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed ├── site_name (List[enum]): linkedin, zip_recruiter, indeed
└── search_term (str) └── search_term (str)
Optional Optional
├── location (str) ├── location (str)
├── distance (int) ├── distance (int): in miles
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (enum): fulltime, parttime, internship, contract
├── is_remote (bool) ├── is_remote (bool)
├── results_wanted (int): per site_type ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_name'
├── easy_apply (bool): only for linkedin ├── easy_apply (bool): filters for jobs on LinkedIn that have the 'Easy Apply' option
└── output_format (enum): json, csv, gsheet
```
### Request Example
```json
{
"site_type": ["indeed", "linkedin"],
"search_term": "software engineer",
"location": "austin, tx",
"distance": 10,
"job_type": "fulltime",
"results_wanted": 15,
"output_format": "gsheet"
}
``` ```
### Response Schema ### Response Schema
```plaintext ```plaintext
site_type (enum): JobPost
JobResponse ├── title (str)
├── success (bool) ├── company_name (str)
├── error (str) ├── job_url (str)
├── jobs (List[JobPost]) ├── location (object)
│ └── JobPost │ ├── country (str)
│ ├── title (str) │ ├── city (str)
│ ├── company_name (str) │ ├── state (str)
│ ├── job_url (str) ├── description (str)
│ ├── location (object) ├── job_type (enum)
│ │ ├── country (str) ├── compensation (object)
│ │ ├── city (str) │ ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
│ │ ├── state (str) │ ├── min_amount (float)
│ ├── description (str) │ ├── max_amount (float)
│ ├── job_type (enum) │ └── currency (str)
│ ├── compensation (object) └── date_posted (datetime)
│ │ ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
│ │ ├── min_amount (float)
│ │ ├── max_amount (float)
│ │ └── currency (str)
│ └── date_posted (datetime)
├── total_results (int)
└── returned_results (int)
```
### Response Example (GOOGLE SHEETS)
```json
{
"status": "Successfully uploaded to Google Sheets",
"error": null,
"linkedin": null,
"indeed": null,
"zip_recruiter": null
}
```
### Response Example (JSON)
```json
{
"indeed": {
"success": true,
"error": null,
"jobs": [
{
"title": "Software Engineer",
"company_name": "INTEL",
"job_url": "https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228",
"location": {
"country": "USA",
"city": "Austin",
"state": "TX",
},
"description": "Job Description Designs, develops, tests, and debugs..."
"job_type": "fulltime",
"compensation": {
"interval": "yearly",
"min_amount": 209760.0,
"max_amount": 139480.0,
"currency": "USD"
},
"date_posted": "2023-08-18T00:00:00"
}, ...
],
"total_results": 845,
"returned_results": 15
},
"linkedin": {
"success": true,
"error": null,
"jobs": [
{
"title": "Software Engineer 1",
"company_name": "Public Partnerships | PPL",
"job_url": "https://www.linkedin.com/jobs/view/3690013792",
"location": {
"country": "USA",
"city": "Austin",
"state": "TX",
},
"description": "Public Partnerships LLC supports individuals with disabilities..."
"job_type": null,
"compensation": null,
"date_posted": "2023-07-31T00:00:00"
}, ...
],
"total_results": 2000,
"returned_results": 15
}
}
```
### Response Example (CSV)
```
Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 209760.0, 139480.0, USD, 2023-08-18T00:00:00, Job Description Designs...
linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
``` ```
## Installation
### Docker Setup
_Requires [Docker Desktop](https://www.docker.com/products/docker-desktop/)_
[JobSpy API Image](https://ghcr.io/cullenwatson/jobspy:latest) is continuously updated and available on GitHub Container Registry. ### FAQ
To pull the Docker image: #### Encountering issues with your queries?
```bash
docker pull ghcr.io/cullenwatson/jobspy:latest
```
#### Params Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, please submit an issue.
By default: #### Received a response code 429?
* Port: `8000` This means you've been blocked by the job board site for sending too many requests. Consider waiting a few seconds, or try using a VPN. Proxy support coming soon.
* Google sheet name: `JobSpy`
* Relative path of `client_secret.json` (for Google Sheets, see below to obtain)
To run the image with these default settings, use:
Example (Cmd Prompt - Windows):
```bash
docker run -v %cd%/client_secret.json:/app/client_secret.json -p 8000:8000 ghcr.io/cullenwatson/jobspy
```
Example (Unix):
```bash
docker run -v $(pwd)/client_secret.json:/app/client_secret.json -p 8000:8000 ghcr.io/cullenwatson/jobspy
```
#### Using custom params
Example:
* Port: `8030`
* Google sheet name: `CustomName`
* Absolute path of `client_secret.json`: `C:\config\client_secret.json`
To pass these custom params:
```bash
docker run -v C:\config\client_secret.json:/app/client_secret.json -e GSHEET_NAME=CustomName -e PORT=8030 -p 8030:8030 ghcr.io/cullenwatson/jobspy
```
### Python installation (alternative to Docker)
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
1. Clone this repository `git clone https://github.com/cullenwatson/jobspy`
2. Install the dependencies with `pip install -r requirements.txt`
3. Run the server with `uvicorn main:app --reload`
### Google Sheets Setup
#### Obtaining an Access Key: [Video Guide](https://youtu.be/w533wJuilao?si=5u3m50pRtdhqkg9Z&t=43)
* Enable the [Google Sheets & Google Drive API](https://console.cloud.google.com/)
* Create credentials -> service account -> create & continue
* Select role -> basic: editor -> done
* Click on the email you just created in the service account list
* Go to the Keys tab -> add key -> create new key -> JSON -> Create
#### Using the key in the repo
* Copy the key file into the JobSpy repo as `client_secret.json`
* Go to [my template sheet](https://docs.google.com/spreadsheets/d/1mOgb-ZGZy_YIhnW9OCqIVvkFwiKFvhMBjNcbakW7BLo/edit?usp=sharing): File -> Make a Copy -> Rename to JobSpy
* Share the Google sheet with the email located in the field `client_email` in the `client_secret.json` above with editor rights
* If you changed the name of the sheet:
- Python install: add `.env` in the repo and add `GSHEET_NAME` param with the sheet name as the value, e.g. `GSHEET_NAME=CustomName`
- Docker install: use custom param `-e GSHEET_NAME=CustomName` in `docker run` (see above)
### How to call the API
#### [Postman](https://www.postman.com/downloads/) (preferred):
To use Postman:
1. Locate the files in the `/postman/` directory.
2. Import the Postman collection and environment JSON files.
#### Swagger UI:
Or you can call the API with the interactive documentation at [localhost:8000/docs](http://localhost:8000/docs).
## FAQ
### I'm having issues with my queries. What should I do?
Try reducing the number of `results_wanted` and/or broadening the filters. If issues still persist, feel free to submit an issue.
### I'm getting response code 429. What should I do?
You have been blocked by the job board site for sending too many requests. Wait a couple seconds or use a VPN.
### How to enable auth?
Change `AUTH_REQUIRED` in `/settings.py` to `True`
The auth uses [supabase](https://supabase.com). Create a project with a `users` table and disable RLS.
<img src="https://github.com/cullenwatson/jobspy/assets/78247585/03af18e1-5386-49ad-a2cf-d34232d9d747" width="500">
Add these three environment variables:
- `SUPABASE_URL`: go to project settings -> API -> Project URL
- `SUPABASE_KEY`: go to project settings -> API -> service_role secret
- `JWT_SECRET_KEY` - type `openssl rand -hex 32` in terminal to create a 32 byte secret key
Use these endpoints to register and get an access token:
![image](https://github.com/cullenwatson/jobspy/assets/78247585/c84c33ec-1fe8-4152-9c8c-6c4334aecfc3)

121
jobscrape/__init__.py Normal file
View File

@ -0,0 +1,121 @@
import pandas as pd
from typing import List, Dict, Tuple, Union
from concurrent.futures import ThreadPoolExecutor
from .core.jobs import JobType
from .core.scrapers.indeed import IndeedScraper
from .core.scrapers.ziprecruiter import ZipRecruiterScraper
from .core.scrapers.linkedin import LinkedInScraper
from .core.scrapers import (
ScraperInput,
Site,
JobResponse,
CommonResponse,
)
# Registry of scraper classes keyed by Site member. scrape_jobs() looks a
# site up here and instantiates the class with no arguments before calling
# its scrape() method.
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
}
def _map_str_to_site(site_name: str) -> Site:
    """
    Look up a Site member by name, ignoring the case of ``site_name``.

    :param site_name: member name in any letter case, e.g. "indeed"
    :return: the Site member named ``site_name.upper()``; a failed lookup
        propagates whatever ``Site[...]`` raises
    """
    member_name = site_name.upper()
    return Site[member_name]
def _job_to_row(site: str, job) -> dict:
    """Flatten one job post (an object exposing ``.dict()``) into a flat row dict."""
    data = job.dict()
    data['site'] = site

    # Formatting JobType: keep only the enum's string value
    data['job_type'] = data['job_type'].value if data['job_type'] else None

    # Formatting Location: lift the nested fields to top-level keys
    location_obj = data.get('location')
    if location_obj and isinstance(location_obj, dict):
        data['city'] = location_obj.get('city', '')
        data['state'] = location_obj.get('state', '')
        data['country'] = location_obj.get('country', 'USA')
    else:
        data['city'] = None
        data['state'] = None
        data['country'] = None

    # Formatting Compensation: lift the nested fields to top-level keys
    compensation_obj = data.get('compensation')
    if compensation_obj and isinstance(compensation_obj, dict):
        interval = compensation_obj.get('interval')
        data['interval'] = interval.value if interval else None
        data['min_amount'] = compensation_obj.get('min_amount')
        data['max_amount'] = compensation_obj.get('max_amount')
        data['currency'] = compensation_obj.get('currency', 'USD')
    else:
        data['interval'] = None
        data['min_amount'] = None
        data['max_amount'] = None
        data['currency'] = None

    return data


def scrape_jobs(
    site_name: str | Site | List[Site],
    search_term: str,
    location: str = "",
    distance: int = None,
    is_remote: bool = False,
    job_type: JobType = None,
    easy_apply: bool = False,  # linkedin
    results_wanted: int = 15
) -> pd.DataFrame:
    """
    Scrapes job data from one or more job sites and merges it into one DataFrame.

    The sites are scraped one after another, in the order given.

    :param site_name: a Site member, a site name (matched case-insensitively
        against the Site member names), or a list mixing either of those
    :param search_term: search text forwarded to every scraper
    :param location: location filter forwarded to every scraper
    :param distance: distance filter forwarded to every scraper
    :param is_remote: remote-only filter forwarded to every scraper
    :param job_type: JobType filter forwarded to every scraper
    :param easy_apply: easy-apply filter (marked as LinkedIn-specific)
    :param results_wanted: number of results requested from each scraper
    :return: pandas dataframe with one row per job and the columns site, title,
        company_name, city, state, job_type, interval, min_amount, max_amount,
        job_url, description; empty (same columns) when nothing was scraped
    """
    desired_order = ['site', 'title', 'company_name', 'city', 'state', 'job_type',
                     'interval', 'min_amount', 'max_amount', 'job_url', 'description']

    # Normalise site_name to a list. Site is tested before str so that a Site
    # member is never mistaken for a plain name, however Site is declared.
    if isinstance(site_name, Site):
        site_type = [site_name]
    elif isinstance(site_name, str):
        site_type = [_map_str_to_site(site_name)]
    else:
        site_type = [
            _map_str_to_site(site) if isinstance(site, str) and not isinstance(site, Site) else site
            for site in site_name
        ]

    scraper_input = ScraperInput(
        site_type=site_type,
        search_term=search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        results_wanted=results_wanted,
    )

    # Keyed by site.value, so a site listed twice keeps only its last response.
    results = {}
    for site in scraper_input.site_type:
        scraper = SCRAPER_MAPPING[site]()
        results[site.value] = scraper.scrape(scraper_input)

    # One single-row frame per job, concatenated once at the end; this keeps
    # the per-column dtypes that row-wise concatenation produces.
    dfs = [
        pd.DataFrame([_job_to_row(site, job)])
        for site, job_response in results.items()
        for job in job_response.jobs
    ]

    if not dfs:
        # Keep the column layout stable so callers can index columns safely.
        return pd.DataFrame(columns=desired_order)

    df = pd.concat(dfs, ignore_index=True)
    return df[desired_order]

View File

@ -1,4 +1,4 @@
from typing import Union from typing import Union, Optional
from datetime import date from datetime import date
from enum import Enum from enum import Enum
@ -19,10 +19,11 @@ class JobType(Enum):
VOLUNTEER = "volunteer" VOLUNTEER = "volunteer"
class Location(BaseModel): class Location(BaseModel):
country: str = "USA" country: str = "USA"
city: str = "" city: str = None
state: str = "" state: Optional[str] = None
class CompensationInterval(Enum): class CompensationInterval(Enum):
@ -35,8 +36,8 @@ class CompensationInterval(Enum):
class Compensation(BaseModel): class Compensation(BaseModel):
interval: CompensationInterval interval: CompensationInterval
min_amount: int min_amount: int = None
max_amount: int max_amount: int = None
currency: str = "USD" currency: str = "USD"
@ -44,11 +45,11 @@ class JobPost(BaseModel):
title: str title: str
company_name: str company_name: str
job_url: str job_url: str
location: Location location: Optional[Location]
description: str = None description: str = None
job_type: JobType = None job_type: Optional[JobType] = None
compensation: Compensation = None compensation: Optional[Compensation] = None
date_posted: date = None date_posted: date = None
@ -56,7 +57,7 @@ class JobResponse(BaseModel):
success: bool success: bool
error: str = None error: str = None
total_results: int = None total_results: Optional[int] = None
jobs: list[JobPost] = [] jobs: list[JobPost] = []
@ -64,6 +65,11 @@ class JobResponse(BaseModel):
@validator("returned_results", pre=True, always=True) @validator("returned_results", pre=True, always=True)
def set_returned_results(cls, v, values): def set_returned_results(cls, v, values):
if v is None and values.get("jobs"): jobs_list = values.get("jobs")
return len(values["jobs"])
if v is None:
if jobs_list is not None:
return len(jobs_list)
else:
return 0
return v return v

View File

@ -18,9 +18,9 @@ class ScraperInput(BaseModel):
search_term: str search_term: str
location: str = None location: str = None
distance: int = None distance: Optional[int] = None
is_remote: bool = False is_remote: bool = False
job_type: JobType = None job_type: Optional[JobType] = None
easy_apply: bool = None # linkedin easy_apply: bool = None # linkedin
results_wanted: int = 15 results_wanted: int = 15

View File

@ -1,21 +1,19 @@
import re import re
import sys
import math
import json import json
from typing import Optional, Tuple, List
from datetime import datetime from datetime import datetime
from typing import Optional, Tuple, List
import tls_client import tls_client
import urllib.parse import urllib.parse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
from .. import Scraper, ScraperInput, Site, StatusException from .. import Scraper, ScraperInput, Site, StatusException
from concurrent.futures import ThreadPoolExecutor, Future
import math
import traceback
import sys
class ParsingException(Exception): class ParsingException(Exception):
pass pass
@ -127,25 +125,21 @@ class IndeedScraper(Scraper):
description=description, description=description,
company_name=job["company"], company_name=job["company"],
location=Location( location=Location(
city=job.get("jobLocationCity", ""), city=job.get("jobLocationCity"),
state=job.get("jobLocationState", ""), state=job.get("jobLocationState"),
postal_code=job.get("jobLocationPostal", ""),
), ),
job_type=job_type,
compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url_client, job_url=job_url_client,
) )
if compensation:
job_post.compensation = compensation
if job_type:
job_post.job_type = job_type
return job_post return job_post
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: with ThreadPoolExecutor(max_workers=10) as executor:
job_post = process_job(job) job_results: list[Future] = [executor.submit(process_job, job) for job in
job_list.append(job_post) jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]]
job_list = [result.result() for result in job_results if result.result()]
return job_list, total_num_jobs return job_list, total_num_jobs

View File

@ -6,7 +6,7 @@ from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ...jobs import JobPost, Location, JobResponse, JobType from ...jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
@ -117,6 +117,7 @@ class LinkedInScraper(Scraper):
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
job_type=job_type, job_type=job_type,
compensation=Compensation(interval=CompensationInterval.YEARLY, currency="USD")
) )
job_list.append(job_post) job_list.append(job_post)
if ( if (
@ -184,7 +185,6 @@ class LinkedInScraper(Scraper):
employment_type = employment_type_span.get_text(strip=True) employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower() employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "") employment_type = employment_type.replace("-", "")
print(employment_type)
return JobType(employment_type) return JobType(employment_type)

View File

@ -1,5 +1,6 @@
import math import math
import json import json
import re
from datetime import datetime from datetime import datetime
from typing import Optional, Tuple, List from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
@ -9,7 +10,7 @@ from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ...scrapers import Scraper, ScraperInput, Site, StatusException from .. import Scraper, ScraperInput, Site, StatusException
from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
@ -24,9 +25,12 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
self.session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
def scrape_page( def scrape_page(
self, scraper_input: ScraperInput, page: int, session: tls_client.Session self, scraper_input: ScraperInput, page: int
) -> tuple[list[JobPost], int | None]: ) -> tuple[list[JobPost], int | None]:
""" """
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
@ -51,6 +55,7 @@ class ZipRecruiterScraper(Scraper):
"search": scraper_input.search_term, "search": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
"page": page, "page": page,
"form": "jobs-landing"
} }
if scraper_input.is_remote: if scraper_input.is_remote:
@ -62,7 +67,7 @@ class ZipRecruiterScraper(Scraper):
if job_type_value: if job_type_value:
params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}" params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
response = session.get( response = self.session.get(
self.url + "/jobs-search", self.url + "/jobs-search",
headers=ZipRecruiterScraper.headers(), headers=ZipRecruiterScraper.headers(),
params=params, params=params,
@ -74,69 +79,22 @@ class ZipRecruiterScraper(Scraper):
html_string = response.text html_string = response.text
soup = BeautifulSoup(html_string, "html.parser") soup = BeautifulSoup(html_string, "html.parser")
if page == 1: script_tag = soup.find("script", {"id": "js_variables"})
script_tag = soup.find("script", {"id": "js_variables"}) data = json.loads(script_tag.string)
data = json.loads(script_tag.string)
if page == 1:
job_count = int(data["totalJobCount"].replace(",", "")) job_count = int(data["totalJobCount"].replace(",", ""))
else: else:
job_count = None job_count = None
job_posts = soup.find_all("div", {"class": "job_content"})
def process_job(job: Tag) -> Optional[JobPost]:
"""
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", {"class": "job_link"})["href"]
if job_url in self.seen_urls:
return None
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, updated_job_url = ZipRecruiterScraper.get_description(
job_url, session
)
if updated_job_url is not None:
job_url = updated_job_url
if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip()
job_type_element = job.find("li", {"class": "perk_item perk_type"})
if job_type_element:
job_type_text = (
job_type_element.text.strip()
.lower()
.replace("-", "")
.replace(" ", "")
)
if job_type_text == "contractor":
job_type_text = "contract"
job_type = JobType(job_type_text)
else:
job_type = None
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
)
return job_post
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
job_results: list[Future] = [ if "jobList" in data and data["jobList"]:
executor.submit(process_job, job) for job in job_posts jobs_js = data["jobList"]
] job_results = [executor.submit(self.process_job_js, job) for job in jobs_js]
else:
jobs_html = soup.find_all("div", {"class": "job_content"})
job_results = [executor.submit(self.process_job_html, job) for job in
jobs_html]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
@ -148,19 +106,17 @@ class ZipRecruiterScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
pages_to_process = max(3, math.ceil(scraper_input.results_wanted / self.jobs_per_page))
try: try:
#: get first page to initialize session #: get first page to initialize session
job_list, total_results = self.scrape_page(scraper_input, 1, session) job_list, total_results = self.scrape_page(scraper_input, 1)
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [ futures: list[Future] = [
executor.submit(self.scrape_page, scraper_input, page, session) executor.submit(self.scrape_page, scraper_input, page)
for page in range(2, pages_to_process + 1) for page in range(2, pages_to_process + 1)
] ]
@ -169,6 +125,7 @@ class ZipRecruiterScraper(Scraper):
job_list += jobs job_list += jobs
except StatusException as e: except StatusException as e:
return JobResponse( return JobResponse(
success=False, success=False,
@ -192,9 +149,129 @@ class ZipRecruiterScraper(Scraper):
) )
return job_response return job_response
def process_job_html(self, job: Tag) -> Optional[JobPost]:
    """
    Builds a JobPost from one HTML job card
    :param job: BeautifulSoup Tag for one job post
    :return: JobPost, or None when the card's link is already in self.seen_urls
    """
    link = job.find("a", {"class": "job_link"})["href"]
    if link in self.seen_urls:
        return None

    title_text = job.find("h2", {"class": "title"}).text
    company_text = job.find("a", {"class": "company_name"}).text.strip()

    # The job page may redirect; prefer the URL it reports when there is one.
    page_description, redirected_url = self.get_description(link)
    final_url = link if redirected_url is None else redirected_url

    # Fall back to the snippet on the card when no description was retrieved.
    if page_description is None:
        page_description = job.find("p", {"class": "job_snippet"}).text.strip()

    parsed_type = None
    perk = job.find("li", {"class": "perk_item perk_type"})
    if perk:
        normalized = perk.text.strip().lower().replace("-", "").replace(" ", "")
        # "contractor" is folded into the "contract" value before the lookup
        parsed_type = JobType("contract" if normalized == "contractor" else normalized)

    posted_on = ZipRecruiterScraper.get_date_posted(job)

    return JobPost(
        title=title_text,
        description=page_description,
        company_name=company_text,
        location=ZipRecruiterScraper.get_location(job),
        job_type=parsed_type,
        compensation=ZipRecruiterScraper.get_compensation(job),
        date_posted=posted_on,
        job_url=final_url,
    )
def process_job_js(self, job: dict) -> Optional[JobPost]:
    """
    Builds a JobPost from one entry of the page's embedded job list

    Keys read from ``job``: Title, Snippet, OrgName, City, State,
    EmploymentType, FormattedSalaryShort, SaveJobURL, JobURL.

    :param job: dict describing one job post
    :return: JobPost, or None when EmploymentType is not a recognised JobType
    """
    amount_pattern = re.compile(r"\$?(\d[\d,]*(?:\.\d+)?)([Kk]?)")

    def parse_amount(token: str) -> Optional[int]:
        """Turn a token such as "$52.5K", "$50K" or "$1,200" into an int, else None."""
        matched = amount_pattern.fullmatch(token.strip())
        if matched is None:
            return None
        number, thousands_suffix = matched.groups()
        scale = 1000 if thousands_suffix else 1
        # round() guards against float artefacts such as 65299.999...
        return round(float(number.replace(",", "")) * scale)

    # Map the job data to the expected fields by the Pydantic model
    title = job.get("Title")
    snippet_html = (job.get("Snippet") or "").strip()
    description = BeautifulSoup(snippet_html, "html.parser").get_text()

    company = job.get("OrgName")
    location = Location(city=job.get("City"), state=job.get("State"))

    try:
        employment_type = (job.get("EmploymentType") or "").replace("-", "_").lower()
        job_type = ZipRecruiterScraper.job_type_from_string(employment_type)
    except ValueError:
        # Unrecognised employment type: skip this job
        return None

    # Expected shape is "<min> to <max> <interval>", e.g. "$50K to $65K ...";
    # the field may also be empty or absent, in which case there is no compensation.
    salary_parts = (job.get("FormattedSalaryShort") or "").split(" ")
    min_amount = parse_amount(salary_parts[0])

    if min_amount is None:
        compensation = None
    else:
        max_amount = None
        if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
            max_amount = parse_amount(salary_parts[2])
        compensation = Compensation(
            interval=CompensationInterval.YEARLY,
            min_amount=min_amount,
            # 0 means no upper bound was listed
            max_amount=max_amount if max_amount is not None else 0,
        )

    # The posting timestamp is embedded as a query parameter of SaveJobURL.
    save_job_url = job.get("SaveJobURL") or ""
    posted_time_match = re.search(r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url)
    if posted_time_match:
        date_posted = datetime.strptime(posted_time_match.group(1), "%Y-%m-%dT%H:%M:%SZ").date()
    else:
        # No timestamp available: fall back to today's date
        date_posted = datetime.now().date()

    return JobPost(
        title=title,
        description=description,
        company_name=company,
        location=location,
        job_type=job_type,
        compensation=compensation,
        date_posted=date_posted,
        job_url=job.get("JobURL"),
    )
@staticmethod @staticmethod
def job_type_from_string(value: str) -> Optional[JobType]:
    """
    Resolves a job type string to its JobType member
    :param value: job type text; "contractor" (any letter case) is treated as
        "contract", and underscores are ignored when matching
    :return: matching JobType, or None when value is empty
    :raises ValueError: when no JobType member has the resulting value
    """
    if not value:
        return None

    if value.lower() == "contractor":
        value = "contract"

    wanted = value.replace("_", "")
    found = next((member for member in JobType if member.value == wanted), None)
    if found is None:
        raise ValueError(f"Invalid value for JobType: {value}")
    return found
def get_description( def get_description(
job_page_url: str, session: tls_client.Session self,
job_page_url: str
) -> Tuple[Optional[str], Optional[str]]: ) -> Tuple[Optional[str], Optional[str]]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
@ -202,7 +279,7 @@ class ZipRecruiterScraper(Scraper):
:param session: :param session:
:return: description or None, response url :return: description or None, response url
""" """
response = session.get( response = self.session.get(
job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
) )
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):

1851
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,8 @@
[tool.poetry] [tool.poetry]
name = "jobspy" name = "jobscrape"
version = "0.1.0" version = "0.1.0"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com>"] authors = ["Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com>", "Cullen Watson <cullen@cullen.ai>"]
readme = "README.md" readme = "README.md"
[tool.poetry.dependencies] [tool.poetry.dependencies]
@ -16,6 +16,7 @@ pydantic = "^2.3.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.4.1" pytest = "^7.4.1"
jupyter = "^1.0.0"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]

View File

@ -1,85 +0,0 @@
from concurrent.futures import ThreadPoolExecutor
from .core.scrapers.indeed import IndeedScraper
from .core.scrapers.ziprecruiter import ZipRecruiterScraper
from .core.scrapers.linkedin import LinkedInScraper
from .core.scrapers import (
ScraperInput,
Site,
JobResponse,
)
import pandas as pd
from .core.jobs import JobType
from typing import List, Tuple
# Registry of scraper classes keyed by Site member. scrape_jobs() looks a
# site up here and instantiates the class with no arguments before calling
# its scrape() method.
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
}
def _map_str_to_site(site_name: str) -> Site:
    """
    Look up a Site member by name, ignoring the case of ``site_name``.

    :param site_name: member name in any letter case, e.g. "indeed"
    :return: the Site member named ``site_name.upper()``; a failed lookup
        propagates whatever ``Site[...]`` raises
    """
    member_name = site_name.upper()
    return Site[member_name]
def scrape_jobs(
    site_name: str | Site | List[Site],
    search_term: str,
    location: str = "",
    distance: int = None,
    is_remote: bool = False,
    job_type: JobType = None,
    easy_apply: bool = False,  # linkedin
    results_wanted: int = 15
) -> pd.DataFrame:
    """
    Scrapes the requested job sites on a thread pool of up to three workers.

    Each job is serialised to JSON, read back as a pandas Series, tagged with
    its site, and appended to the result as a new column (concat on axis=1).
    :return: pandas dataframe containing job data
    """
    if type(site_name) == str:
        site_name = _map_str_to_site(site_name)

    if type(site_name) == Site:
        sites = [site_name]
    else:
        sites = site_name

    scraper_input = ScraperInput(
        site_type=sites,
        search_term=search_term,
        location=location,
        is_remote=is_remote,
        easy_apply=easy_apply,
        results_wanted=results_wanted,
    )

    # Optional filters are assigned only when a truthy value was supplied
    if distance:
        scraper_input.distance = distance

    if job_type:
        scraper_input.job_type = job_type

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        """Run the scraper registered for ``site``; return (site value, response)."""
        response: JobResponse = SCRAPER_MAPPING[site]().scrape(scraper_input)
        return site.value, response

    with ThreadPoolExecutor(max_workers=3) as pool:
        responses = dict(pool.map(scrape_site, scraper_input.site_type))

    frame = pd.DataFrame()

    for site_label, response in responses.items():
        for post in response.jobs:
            series = pd.read_json(post.json(), typ='series')
            series['site'] = site_label

            #: concat
            frame = pd.concat([frame, series], axis=1)

    return frame

View File

@ -1,4 +1,4 @@
from src.jobspy import scrape_jobs from jobscrape import scrape_jobs
def test_indeed(): def test_indeed():

View File

@ -1,4 +1,4 @@
from src.jobspy import scrape_jobs from jobscrape import scrape_jobs
def test_ziprecruiter(): def test_ziprecruiter():