mirror of https://github.com/Bunsly/JobSpy
Library Migration (#31)
parent 7efece8fe9
commit 153ac35248
@@ -1,3 +0,0 @@
{
    "experimental": "enabled"
}
@@ -1,33 +0,0 @@
name: Build and Push Docker Image

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to GitHub Docker Registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and Push Image
        uses: docker/build-push-action@v2
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: ghcr.io/${{ github.repository_owner }}/jobspy:latest
          platforms: linux/amd64,linux/arm64
@@ -0,0 +1,33 @@
name: Publish Python 🐍 distributions 📦 to PyPI
on: push

jobs:
  build-n-publish:
    name: Build and publish Python 🐍 distributions 📦 to PyPI
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install poetry
        run: >-
          python3 -m
          pip install
          poetry
          --user

      - name: Build distribution 📦
        run: >-
          python3 -m
          poetry
          build

      - name: Publish distribution 📦 to PyPI
        if: startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
@@ -1,89 +0,0 @@
name: JobSpy API Tests

on: [push, pull_request]

jobs:
  test_api:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python 3.10
        uses: actions/setup-python@v2
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Install jq
        run: sudo apt-get install jq

      - name: Start JobSpy FastAPI app
        run: uvicorn main:app --host 0.0.0.0 --port 8000 &

      - name: Wait for server to be up
        run: |
          for i in {1..10}; do
            curl -s http://0.0.0.0:8000/api/v1/jobs && break || sleep 1
          done

      - name: Check health
        run: |
          health_status=$(curl -L -s -o /dev/null -w "%{http_code}" http://0.0.0.0:8000/health)

          if [ "$health_status" != "200" ]; then
            echo "Error: Health check failed with status code $health_status"
            exit 1
          fi

      # not checking currently because of bad ip at Github's servers being blocked
      # - name: Check HTTP status of POST /api/v1/jobs/
      #   run: |
      #     response=$(curl -L -s -X 'POST' -H 'Content-Type: application/json' -d '{
      #       "site_type": ["indeed", "linkedin"],
      #       "search_term": "software engineer",
      #       "location": "austin, tx",
      #       "distance": 10,
      #       "job_type": "fulltime",
      #       "results_wanted": 5
      #     }' http://0.0.0.0:8000/api/v1/jobs -w "%{http_code}")
      #
      #     status_code="${response: -3}"
      #     echo "Received status code: $status_code"
      #
      #     if [ "$status_code" != "200" ]; then
      #       echo "Error: Expected status code 200, but got $status_code"
      #       exit 1
      #     fi
      #
      #     echo "${response::-3}" > response.json
      #     cat response.json
      #
      # - name: Check error field in response
      #   run: |
      #     global_error=$(jq '.error' response.json)
      #     indeed_error=$(jq '.indeed.error' response.json)
      #     linkedin_error=$(jq '.linkedin.error' response.json)
      #
      #     if [[ "$indeed_error" != "null" || "$linkedin_error" != "null" ]]; then
      #       echo "Error found in response:"
      #       echo "Global Error: $global_error"
      #       echo "Indeed Error: $indeed_error"
      #       echo "LinkedIn Error: $linkedin_error"
      #       exit 1
      #     fi
      #
      # - name: Verify returned_results in response
      #   run: |
      #     indeed_results=$(jq '.indeed.returned_results' response.json)
      #     linkedin_results=$(jq '.linkedin.returned_results' response.json)
      #
      #     if [[ $indeed_results -ne 5 || $linkedin_results -ne 5 ]]; then
      #       echo "Mismatch in results_wanted and returned_results:"
      #       echo "Indeed: Expected 5, Got $indeed_results"
      #       echo "LinkedIn: Expected 5, Got $linkedin_results"
      #       exit 1
      #     fi
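For local debugging, the checks this workflow performs with curl and jq can be sketched in Python. This is a minimal illustration, assuming the FastAPI app above is running on localhost:8000 and using the same request shape as the commented-out step; it is not part of the workflow itself.

```python
import requests

# Same payload as the commented-out workflow step above.
payload = {
    "site_type": ["indeed", "linkedin"],
    "search_term": "software engineer",
    "location": "austin, tx",
    "distance": 10,
    "job_type": "fulltime",
    "results_wanted": 5,
}

resp = requests.post("http://localhost:8000/api/v1/jobs", json=payload, timeout=60)
assert resp.status_code == 200, f"Expected 200, got {resp.status_code}"

data = resp.json()
for site in ("indeed", "linkedin"):
    # Mirror the jq checks: no per-site error, and the requested result count.
    assert data[site]["error"] is None, f"{site} error: {data[site]['error']}"
    assert data[site]["returned_results"] == 5
```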
@@ -5,4 +5,5 @@
**/__pycache__/
*.pyc
.env
client_secret.json
dist
/.ipynb_checkpoints/
@@ -1,13 +0,0 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Module",
            "type": "python",
            "request": "launch",
            "module": "uvicorn",
            "args": ["main:app", "--reload"]
        }
    ]
}
Dockerfile (15)
@@ -1,15 +0,0 @@
FROM python:3.10-slim

WORKDIR /app

COPY . /app

RUN apt-get update && \
    apt-get install -y jq && \
    pip install --no-cache-dir -r requirements.txt

EXPOSE 8000

ENV PORT=8000

CMD sh -c "uvicorn main:app --host 0.0.0.0 --port $PORT"
@@ -0,0 +1,702 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c3f21577-477d-451e-9914-5d67e8a89075",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    … (display_data output elided: the 30-row scraped-jobs DataFrame rendered as an HTML table plus its text/plain twin; columns site, title, company_name, city, state, job_type, interval, min_amount, max_amount, job_url, description; rows 0-9 from indeed, 10-19 from linkedin, 20-29 from zip_recruiter)
   ],
   "source": [
    "from jobscrape import scrape_jobs\n",
    "import pandas as pd\n",
    "\n",
    "jobs: pd.DataFrame = scrape_jobs(\n",
    "    site_name=[\"indeed\", \"linkedin\", \"zip_recruiter\"],\n",
    "    search_term=\"software engineer\",\n",
    "    results_wanted=10\n",
    ")\n",
    "\n",
    "if jobs.empty:\n",
    "    print(\"No jobs found.\")\n",
    "else:\n",
    "\n",
    "    #1 print\n",
    "    pd.set_option('display.max_columns', None)\n",
    "    pd.set_option('display.max_rows', None)\n",
    "    pd.set_option('display.width', None)\n",
    "    pd.set_option('display.max_colwidth', 50)  # set to 0 to see full job url / desc\n",
    "    print(jobs)\n",
    "\n",
    "    #2 display in Jupyter Notebook\n",
    "    display(jobs)\n",
    "\n",
    "    #3 output to csv\n",
    "    jobs.to_csv('jobs.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efd667ef-fdf0-452a-b5e5-ce6825755be7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1574dc17-0a42-4655-964f-5c03a6d3deb0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "my-poetry-env",
   "language": "python",
   "name": "my-poetry-env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
README.md (290)
@@ -1,240 +1,100 @@
# JobSpy AIO Scraper
# JobSpy

**JobSpy** is a simple, yet comprehensive, job scraping library.

## Features

- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Returns jobs as JSON or CSV with title, location, company, description & other data
- Imports directly into **Google Sheets**
- Optional JWT authorization
- Aggregates the job postings in a Pandas DataFrame

![jobspy_gsheet](https://github.com/cullenwatson/JobSpy/assets/78247585/9f0a997c-4e33-4167-b04e-31ab1f606edb)
### Installation
`pip install jobscrape`

_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_

### Usage

```python
from jobscrape import scrape_jobs
import pandas as pd

jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    results_wanted=10
)

if jobs.empty:
    print("No jobs found.")
else:

    #1 print
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)  # set to 0 to see full job url / desc
    print(jobs)

    #2 display in Jupyter Notebook
    display(jobs)

    #3 output to csv
    jobs.to_csv('jobs.csv', index=False)
```

### Output
```
site           title                             company_name      city          state  job_type  interval  min_amount  max_amount  job_url                                            description
indeed         Software Engineer                 AMERICAN SYSTEMS  Arlington     VA     None      yearly    200000      150000      https://www.indeed.com/viewjob?jk=5e409e577046...  THIS POSITION COMES WITH A 10K SIGNING BONUS! ...
indeed         Senior Software Engineer          TherapyNotes.com  Philadelphia  PA     fulltime  yearly    135000      110000      https://www.indeed.com/viewjob?jk=da39574a40cb...  About Us TherapyNotes is the national leader i...
linkedin       Software Engineer - Early Career  Lockheed Martin   Sunnyvale     CA     fulltime  yearly    None        None        https://www.linkedin.com/jobs/view/3693012711      Description:By bringing together people that u...
linkedin       Full-Stack Software Engineer      Rain              New York      NY     fulltime  yearly    None        None        https://www.linkedin.com/jobs/view/3696158877      Rain’s mission is to create the fastest and ea...
zip_recruiter  Software Engineer - New Grad      ZipRecruiter      Santa Monica  CA     fulltime  yearly    130000      150000      https://www.ziprecruiter.com/jobs/ziprecruiter...  We offer a hybrid work environment. Most US-ba...
zip_recruiter  Software Developer                TEKsystems        Phoenix       AZ     fulltime  hourly    65          75          https://www.ziprecruiter.com/jobs/teksystems-0...  Top Skills' Details• 6 years of Java developme...
```
### Parameters for `scrape_jobs()`
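The parameter list was not filled in at this point in the README; the names below are inferred from the request schema in the API section that follows and from the usage example above, so treat this as a sketch rather than a definitive signature.

```python
from jobscrape import scrape_jobs

# Hypothetical call exercising the optional filters from the request
# schema below (the API's site_type maps to site_name in the library).
jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    location="austin, tx",   # assumed: mirrors the API's location field
    distance=10,             # in miles
    job_type="fulltime",     # fulltime, parttime, internship, contract
    is_remote=False,
    results_wanted=15,       # per site
    easy_apply=True,         # LinkedIn only
)
print(jobs.head())
```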
### API

POST `/api/v1/jobs/`
### Request Schema
```plaintext
Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed
└── search_term (str)
Optional
├── location (str)
├── distance (int): in miles
├── job_type (enum): fulltime, parttime, internship, contract
├── is_remote (bool)
├── results_wanted (int): per site_type
├── easy_apply (bool): only for linkedin
└── output_format (enum): json, csv, gsheet
```
### Request Example
```json
{
  "site_type": ["indeed", "linkedin"],
  "search_term": "software engineer",
  "location": "austin, tx",
  "distance": 10,
  "job_type": "fulltime",
  "results_wanted": 15,
  "output_format": "gsheet"
}
```
### Response Schema
```plaintext
site_type (enum):
JobResponse
├── success (bool)
├── error (str)
├── jobs (List[JobPost])
│   └── JobPost
│       ├── title (str)
│       ├── company_name (str)
│       ├── job_url (str)
│       ├── location (object)
│       │   ├── country (str)
│       │   ├── city (str)
│       │   └── state (str)
│       ├── description (str)
│       ├── job_type (enum)
│       ├── compensation (object)
│       │   ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
│       │   ├── min_amount (float)
│       │   ├── max_amount (float)
│       │   └── currency (str)
│       └── date_posted (datetime)
│
├── total_results (int)
└── returned_results (int)
```
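To make the schema concrete, here is a small hedged sketch of walking one site's `JobResponse` from the parsed JSON body; the field names come straight from the tree above, but the parsing code itself is illustrative, not part of the API.

```python
# resp_json: dict parsed from the POST /api/v1/jobs/ response body.
def summarize_site(resp_json: dict, site: str) -> None:
    job_response = resp_json[site]
    if not job_response["success"]:
        print(f"{site} failed: {job_response['error']}")
        return
    print(f"{site}: {job_response['returned_results']} of "
          f"{job_response['total_results']} results")
    for job in job_response["jobs"]:
        comp = job.get("compensation")
        pay = (f"{comp['min_amount']}-{comp['max_amount']} {comp['currency']} "
               f"({comp['interval']})") if comp else "n/a"
        print(f"  {job['title']} @ {job['company_name']} [{pay}] {job['job_url']}")
```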
### Response Example (GOOGLE SHEETS)
```json
{
    "status": "Successfully uploaded to Google Sheets",
    "error": null,
    "linkedin": null,
    "indeed": null,
    "zip_recruiter": null
}
```
### Response Example (JSON)
```json
{
    "indeed": {
        "success": true,
        "error": null,
        "jobs": [
            {
                "title": "Software Engineer",
                "company_name": "INTEL",
                "job_url": "https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228",
                "location": {
                    "country": "USA",
                    "city": "Austin",
                    "state": "TX"
                },
                "description": "Job Description Designs, develops, tests, and debugs...",
                "job_type": "fulltime",
                "compensation": {
                    "interval": "yearly",
                    "min_amount": 209760.0,
                    "max_amount": 139480.0,
                    "currency": "USD"
                },
                "date_posted": "2023-08-18T00:00:00"
            }, ...
        ],
        "total_results": 845,
        "returned_results": 15
    },
    "linkedin": {
        "success": true,
        "error": null,
        "jobs": [
            {
                "title": "Software Engineer 1",
                "company_name": "Public Partnerships | PPL",
                "job_url": "https://www.linkedin.com/jobs/view/3690013792",
                "location": {
                    "country": "USA",
                    "city": "Austin",
                    "state": "TX"
                },
                "description": "Public Partnerships LLC supports individuals with disabilities...",
                "job_type": null,
                "compensation": null,
                "date_posted": "2023-07-31T00:00:00"
            }, ...
        ],
        "total_results": 2000,
        "returned_results": 15
    }
}
```
### Response Example (CSV)
```
Site, Title, Company Name, Job URL, Country, City, State, Job Type, Compensation Interval, Min Amount, Max Amount, Currency, Date Posted, Description
indeed, Software Engineer, INTEL, https://www.indeed.com/jobs/viewjob?jk=a2cfbb98d2002228, USA, Austin, TX, fulltime, yearly, 209760.0, 139480.0, USD, 2023-08-18T00:00:00, Job Description Designs...
linkedin, Software Engineer 1, Public Partnerships | PPL, https://www.linkedin.com/jobs/view/3690013792, USA, Austin, TX, , , , , , 2023-07-31T00:00:00, Public Partnerships LLC supports...
```
```plaintext
JobPost
├── title (str)
├── company_name (str)
├── job_url (str)
├── location (object)
│   ├── country (str)
│   ├── city (str)
│   └── state (str)
├── description (str)
├── job_type (enum)
├── compensation (object)
│   ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly
│   ├── min_amount (float)
│   ├── max_amount (float)
│   └── currency (str)
└── date_posted (datetime)
```
## Installation
### Docker Setup
_Requires [Docker Desktop](https://www.docker.com/products/docker-desktop/)_

[JobSpy API Image](https://ghcr.io/cullenwatson/jobspy:latest) is continuously updated and available on GitHub Container Registry.
### FAQ

To pull the Docker image:

```bash
docker pull ghcr.io/cullenwatson/jobspy:latest
```
#### Encountering issues with your queries?

#### Params
Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, please submit an issue.

By default:
* Port: `8000`
* Google sheet name: `JobSpy`
* Relative path of `client_secret.json` (for Google Sheets, see below to obtain)

#### Received a response code 429?
This means you've been blocked by the job board site for sending too many requests. Consider waiting a few seconds, or try using a VPN. Proxy support coming soon.

To run the image with these default settings, use:

Example (Cmd Prompt - Windows):
```bash
docker run -v %cd%/client_secret.json:/app/client_secret.json -p 8000:8000 ghcr.io/cullenwatson/jobspy
```

Example (Unix):
```bash
docker run -v $(pwd)/client_secret.json:/app/client_secret.json -p 8000:8000 ghcr.io/cullenwatson/jobspy
```

#### Using custom params

Example:
* Port: `8030`
* Google sheet name: `CustomName`
* Absolute path of `client_secret.json`: `C:\config\client_secret.json`

To pass these custom params:
```bash
docker run -v C:\config\client_secret.json:/app/client_secret.json -e GSHEET_NAME=CustomName -e PORT=8030 -p 8030:8030 ghcr.io/cullenwatson/jobspy
```
### Python installation (alternative to Docker)
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
1. Clone this repository `git clone https://github.com/cullenwatson/jobspy`
2. Install the dependencies with `pip install -r requirements.txt`
3. Run the server with `uvicorn main:app --reload`
### Google Sheets Setup

#### Obtaining an Access Key: [Video Guide](https://youtu.be/w533wJuilao?si=5u3m50pRtdhqkg9Z&t=43)
* Enable the [Google Sheets & Google Drive API](https://console.cloud.google.com/)
* Create credentials -> service account -> create & continue
* Select role -> basic: editor -> done
* Click on the email you just created in the service account list
* Go to the Keys tab -> add key -> create new key -> JSON -> Create

#### Using the key in the repo
* Copy the key file into the JobSpy repo as `client_secret.json`
* Go to [my template sheet](https://docs.google.com/spreadsheets/d/1mOgb-ZGZy_YIhnW9OCqIVvkFwiKFvhMBjNcbakW7BLo/edit?usp=sharing): File -> Make a Copy -> Rename to JobSpy
* Share the Google sheet with the email located in the field `client_email` in the `client_secret.json` above, with editor rights
* If you changed the name of the sheet:
  - Python install: add `.env` in the repo and add `GSHEET_NAME` param with the sheet name as the value, e.g. `GSHEET_NAME=CustomName`
  - Docker install: use custom param `-e GSHEET_NAME=CustomName` in `docker run` (see above)
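To sanity-check the key and the sheet sharing, the credentials flow the server uses (see the deleted CSV formatter later in this diff) can be exercised directly. A minimal sketch, assuming `client_secret.json` sits in the working directory and the sheet is named `JobSpy`:

```python
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]
credentials = ServiceAccountCredentials.from_json_keyfile_name("client_secret.json", scope)
gc = gspread.authorize(credentials)

# Raises SpreadsheetNotFound if the sheet was not shared with client_email.
worksheet = gc.open("JobSpy").get_worksheet(0)
print(worksheet.row_values(1))  # header row
```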

### How to call the API

#### [Postman](https://www.postman.com/downloads/) (preferred):
To use Postman:
1. Locate the files in the `/postman/` directory.
2. Import the Postman collection and environment JSON files.

#### Swagger UI:
Or you can call the API with the interactive documentation at [localhost:8000/docs](http://localhost:8000/docs).

## FAQ

### I'm having issues with my queries. What should I do?

Try reducing the number of `results_wanted` and/or broadening the filters. If issues still persist, feel free to submit an issue.

### I'm getting response code 429. What should I do?
You have been blocked by the job board site for sending too many requests. Wait a couple of seconds or use a VPN.

### How do I enable auth?

Change `AUTH_REQUIRED` in `/settings.py` to `True`.

The auth uses [supabase](https://supabase.com). Create a project with a `users` table and disable RLS.

<img src="https://github.com/cullenwatson/jobspy/assets/78247585/03af18e1-5386-49ad-a2cf-d34232d9d747" width="500">

Add these three environment variables:

- `SUPABASE_URL`: go to project settings -> API -> Project URL
- `SUPABASE_KEY`: go to project settings -> API -> service_role secret
- `JWT_SECRET_KEY`: type `openssl rand -hex 32` in a terminal to create a 32-byte secret key

Use these endpoints to register and get an access token:

![image](https://github.com/cullenwatson/jobspy/assets/78247585/c84c33ec-1fe8-4152-9c8c-6c4334aecfc3)
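For reference, the register and token endpoints shown above can be driven from Python. A hedged sketch whose paths and field names follow the routers defined later in this diff, trailing slashes included:

```python
import requests

base = "http://localhost:8000/api/auth"

# Register a user (UserCreate model: username, full_name, email, password).
requests.post(base + "/register/", json={
    "username": "alice",
    "full_name": "Alice Example",
    "email": "alice@example.com",
    "password": "s3cret",
}).raise_for_status()

# The token endpoint expects an OAuth2 form, not JSON.
resp = requests.post(base + "/token/", data={"username": "alice", "password": "s3cret"})
token = resp.json()["access_token"]

# Authenticated call to the jobs endpoint.
headers = {"Authorization": f"Bearer {token}"}
print(requests.post("http://localhost:8000/api/v1/jobs/",
                    json={"site_type": ["indeed"], "search_term": "python"},
                    headers=headers).status_code)
```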
@@ -1,9 +0,0 @@
from fastapi import APIRouter
from api.auth import router as auth_router
from .v1 import router as v1_router

router = APIRouter(
    prefix="/api",
)
router.include_router(v1_router)
router.include_router(auth_router)
@@ -1,8 +0,0 @@
from fastapi import APIRouter

from api.auth.token import router as token_router
from api.auth.register import router as register_router

router = APIRouter(prefix="/auth", tags=["auth"])
router.include_router(token_router)
router.include_router(register_router)
@@ -1,65 +0,0 @@
from datetime import datetime, timedelta

from jose import jwt, JWTError
from fastapi import HTTPException, status, Depends
from fastapi.security import OAuth2PasswordBearer

from api.core.users import TokenData
from api.auth.db_utils import UserInDB, get_user

# assumed import: these constants are referenced below, and sibling modules
# in this diff import their configuration from settings the same way
from settings import JWT_SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES

oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token")


def create_access_token(data: dict) -> str:
    """
    Creates a JWT token based on the data provided.
    :param data: claims to encode, e.g. {"sub": username}
    :return: encoded_jwt
    """
    to_encode = data.copy()
    expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    to_encode.update({"exp": expire})
    encoded_jwt = jwt.encode(to_encode, JWT_SECRET_KEY, algorithm=ALGORITHM)
    return encoded_jwt


async def get_current_user(token: str = Depends(oauth2_scheme)):
    """
    Returns the current user associated with the provided JWT token.
    :param token: bearer token extracted by the OAuth2 scheme
    :raises HTTPException: If the token is invalid or the user does not exist.
    :return: The UserInDB instance associated with the token.
    """
    credential_exception = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )
    try:
        payload = jwt.decode(token, JWT_SECRET_KEY, algorithms=[ALGORITHM])
        username: str = payload.get("sub")
        if username is None:
            raise credential_exception
        token_data = TokenData(username=username)
    except JWTError:
        raise credential_exception

    current_user = get_user(token_data.username)
    if current_user is None:
        raise credential_exception
    return current_user


async def get_active_current_user(current_user: UserInDB = Depends(get_current_user)):
    """
    Returns the current user if the user account is active.

    :param current_user: A UserInDB instance representing the current user.
    :raises HTTPException: If the user account is inactive.
    :return: The UserInDB instance if the user account is active.
    """
    if current_user.disabled:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED, detail="Inactive user."
        )
    return current_user
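A quick hedged round-trip of the helper above, useful when tuning `ACCESS_TOKEN_EXPIRE_MINUTES`; `jwt.decode` here is the same python-jose call that `get_current_user` makes.

```python
from jose import jwt

token = create_access_token({"sub": "alice"})
claims = jwt.decode(token, JWT_SECRET_KEY, algorithms=[ALGORITHM])
assert claims["sub"] == "alice"
print(claims["exp"])  # unix expiry timestamp set by create_access_token
```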
@@ -1,89 +0,0 @@
from typing import Optional, Union

from passlib.context import CryptContext
from supabase_py import create_client, Client
from fastapi import HTTPException, status

from api.core.users import UserInDB
from settings import SUPABASE_URL, SUPABASE_KEY

pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
if SUPABASE_URL:
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


def create_user(user_create: UserInDB):
    """
    Creates a new user record in the 'users' table in Supabase.

    :param user_create: The data of the user to be created.
    :raises HTTPException: If an error occurs while creating the user.
    :return: The result of the insert operation.
    """
    result = supabase.table("users").insert(user_create.dict()).execute()
    print(f"Insert result: {result}")

    if "error" in result and result["error"]:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"User could not be created due to {result['error']['message']}",
        )

    return result


def get_user(username: str) -> Optional[UserInDB]:
    """
    Retrieves a user from the 'users' table by their username.

    :param username: The username of the user to retrieve.
    :return: The user data if found, otherwise None.
    """
    result = supabase.table("users").select().eq("username", username).execute()

    if "error" in result and result["error"]:
        print(f"Error: {result['error']['message']}")
        return None
    else:
        if result["data"]:
            user_data = result["data"][0]
            return UserInDB(**user_data)
        else:
            return None


def verify_password(password: str, hashed_password: str) -> bool:
    """
    Verifies a password against a hashed password using the bcrypt hashing algorithm.

    :param password: The plaintext password to verify.
    :param hashed_password: The hashed password to compare against.
    :return: True if the password matches the hashed password, otherwise False.
    """
    return pwd_context.verify(password, hashed_password)


def get_password_hash(password: str) -> str:
    """
    Hashes a password using the bcrypt hashing algorithm.

    :param password: The plaintext password to hash.
    :return: The hashed password
    """
    return pwd_context.hash(password)


def authenticate_user(username: str, password: str) -> Union[UserInDB, bool]:
    """
    Authenticates a user based on their username and password.

    :param username: The username of the user to authenticate.
    :param password: The plaintext password to authenticate.
    :return: The authenticated user if the username and password are correct, otherwise False.
    """
    user = get_user(username)
    if not user:
        return False
    if not verify_password(password, user.hashed_password):
        return False
    return user
@@ -1,33 +0,0 @@
from fastapi import APIRouter, HTTPException, status
from api.core.users import UserCreate, UserInDB
from api.auth.db_utils import get_user, get_password_hash, create_user

router = APIRouter(prefix="/register")


@router.post("/", response_model=dict)
async def register_new_user(user: UserCreate) -> dict:
    """
    Creates a new user.
    :param user: the registration payload
    :raises HTTPException: If the username already exists.
    :return: A dictionary containing a detail key with a success message.
    """
    existing_user = get_user(user.username)
    if existing_user is not None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Username already exists",
        )

    hashed_password = get_password_hash(user.password)
    user_create = UserInDB(
        username=user.username,
        email=user.email,
        full_name=user.full_name,
        hashed_password=hashed_password,
        disabled=False,
    )
    create_user(user_create)

    return {"detail": "User created successfully"}
@@ -1,30 +0,0 @@
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm

from api.core.users import Token
from api.auth.db_utils import authenticate_user
from api.auth.auth_utils import create_access_token

router = APIRouter(prefix="/token")


@router.post("/", response_model=Token)
async def login_for_access_token(
    form_data: OAuth2PasswordRequestForm = Depends(),
) -> Token:
    """
    Authenticates a user and provides an access token.
    :param form_data: OAuth2PasswordRequestForm object containing the user's credentials.
    :raises HTTPException: If the user cannot be authenticated.
    :return: A Token object containing the access token and the token type.
    """
    user = authenticate_user(form_data.username, form_data.password)
    if not user:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )

    access_token = create_access_token(data={"sub": user.username})
    return Token(access_token=access_token, token_type="bearer")
@@ -1,7 +0,0 @@
from enum import Enum


class OutputFormat(Enum):
    CSV = "csv"
    JSON = "json"
    GSHEET = "gsheet"
@@ -1,133 +0,0 @@
import gspread
from oauth2client.service_account import ServiceAccountCredentials

import csv
from io import StringIO
from datetime import datetime

from ...jobs import *
from ...scrapers import *
from settings import *


class CSVFormatter:
    @staticmethod
    def fetch_job_urls(credentials: Any) -> set:
        """
        Fetches all the job urls from the google sheet to prevent duplicates
        :param credentials: service account credentials for gspread
        :return: urls
        """
        try:
            gc = gspread.authorize(credentials)
            sh = gc.open(GSHEET_NAME)

            worksheet = sh.get_worksheet(0)
            data = worksheet.get_all_values()
            job_urls = set()
            for row in data[1:]:
                job_urls.add(row[3])
            return job_urls
        except Exception as e:
            raise e

    @staticmethod
    def upload_to_google_sheet(csv_data: str):
        """
        Appends rows to google sheet
        :param csv_data: StringIO buffer of CSV rows produced by format()
        :return:
        """
        try:
            scope = [
                "https://www.googleapis.com/auth/spreadsheets",
                "https://www.googleapis.com/auth/drive.file",
                "https://www.googleapis.com/auth/drive",
            ]
            credentials = ServiceAccountCredentials.from_json_keyfile_name(
                "client_secret.json", scope
            )
            gc = gspread.authorize(credentials)
            sh = gc.open(GSHEET_NAME)

            worksheet = sh.get_worksheet(0)
            data_string = csv_data.getvalue()
            reader = csv.reader(StringIO(data_string))

            job_urls = CSVFormatter.fetch_job_urls(credentials)

            rows = list(reader)

            for i, row in enumerate(rows):
                if i == 0:
                    continue  # skip the header row
                if row[4] in job_urls:
                    continue  # skip rows already present in the sheet

                # format salary amounts with thousands separators
                row[6] = format(int(row[6]), ",d") if row[6] else ""
                row[7] = format(int(row[7]), ",d") if row[7] else ""
                worksheet.append_row(row)
        except Exception as e:
            raise e

    @staticmethod
    def generate_filename() -> str:
        """
        Adds a timestamp to the filename
        :return: filename
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"JobSpy_results_{timestamp}.csv"

    @staticmethod
    def format(jobs: CommonResponse) -> StringIO:
        """
        Transforms the job objects into CSV
        :param jobs:
        :return: csv
        """
        output = StringIO()
        writer = csv.writer(output)

        headers = [
            "Title",
            "Company Name",
            "City",
            "State",
            "Job Type",
            "Pay Cycle",
            "Min Amount",
            "Max Amount",
            "Date Posted",
            "Description",
            "Job URL",
        ]
        writer.writerow(headers)

        for site, job_response in jobs.dict().items():
            if isinstance(job_response, dict) and job_response.get("success"):
                for job in job_response["jobs"]:
                    writer.writerow(
                        [
                            job["title"],
                            job["company_name"],
                            job["location"]["city"],
                            job["location"]["state"],
                            job["job_type"].value if job.get("job_type") else "",
                            job["compensation"]["interval"].value
                            if job["compensation"]
                            else "",
                            job["compensation"]["min_amount"]
                            if job["compensation"]
                            else "",
                            job["compensation"]["max_amount"]
                            if job["compensation"]
                            else "",
                            job.get("date_posted", ""),
                            job["description"],
                            job["job_url"],
                        ]
                    )

        output.seek(0)
        return output
@@ -1,28 +0,0 @@
from pydantic import BaseModel


class User(BaseModel):
    username: str
    full_name: str
    email: str
    disabled: bool = False


class UserCreate(BaseModel):
    username: str
    full_name: str
    email: str
    password: str


class UserInDB(User):
    hashed_password: str


class TokenData(BaseModel):
    username: str


class Token(BaseModel):
    access_token: str
    token_type: str
@@ -1,11 +0,0 @@
from fastapi import APIRouter, Depends
from .jobs import router as jobs_router
from api.auth.auth_utils import get_active_current_user
from settings import AUTH_REQUIRED

if AUTH_REQUIRED:
    router = APIRouter(prefix="/v1", dependencies=[Depends(get_active_current_user)])
else:
    router = APIRouter(prefix="/v1")

router.include_router(jobs_router)
@@ -1,68 +0,0 @@
import io
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from concurrent.futures import ThreadPoolExecutor

from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers.linkedin import LinkedInScraper
from api.core.formatters.csv import CSVFormatter
from api.core.scrapers import (
    ScraperInput,
    Site,
    JobResponse,
    OutputFormat,
    CommonResponse,
)
from typing import List, Dict, Tuple, Union

router = APIRouter(prefix="/jobs", tags=["jobs"])

SCRAPER_MAPPING = {
    Site.LINKEDIN: LinkedInScraper,
    Site.INDEED: IndeedScraper,
    Site.ZIP_RECRUITER: ZipRecruiterScraper,
}


@router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
    """
    Asynchronously scrapes job data from multiple job sites.
    :param scraper_input:
    :return: scraper_response
    """

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class()
        scraped_data: JobResponse = scraper.scrape(scraper_input)
        return (site.value, scraped_data)

    with ThreadPoolExecutor(max_workers=3) as executor:
        results = dict(executor.map(scrape_site, scraper_input.site_type))
    scraper_response = CommonResponse(status="JSON response success", **results)

    if scraper_input.output_format == OutputFormat.CSV:
        csv_output = CSVFormatter.format(scraper_response)
        response = StreamingResponse(csv_output, media_type="text/csv")
        response.headers[
            "Content-Disposition"
        ] = f"attachment; filename={CSVFormatter.generate_filename()}"
        return response

    elif scraper_input.output_format == OutputFormat.GSHEET:
        csv_output = CSVFormatter.format(scraper_response)
        try:
            CSVFormatter.upload_to_google_sheet(csv_output)
            return CommonResponse(
                status="Successfully uploaded to Google Sheets", **results
            )

        except Exception as e:
            return CommonResponse(
                status="Failed to upload to Google Sheet", error=repr(e), **results
            )

    else:
        return scraper_response
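The CSV branch above combines StreamingResponse with a Content-Disposition header to serve an in-memory buffer as a file download. A minimal standalone sketch of that pattern (route path, app name, and filename are illustrative, not from this repo):

import io

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

demo_app = FastAPI()


@demo_app.get("/csv-demo")  # hypothetical route, for illustration only
async def csv_demo() -> StreamingResponse:
    buffer = io.StringIO("Title,Company Name\nSoftware Engineer,Acme\n")
    response = StreamingResponse(buffer, media_type="text/csv")
    # The attachment disposition is what makes browsers save it as a file
    response.headers["Content-Disposition"] = "attachment; filename=demo.csv"
    return response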
@@ -0,0 +1,121 @@
import pandas as pd
from typing import List, Dict, Tuple, Union

from concurrent.futures import ThreadPoolExecutor

from .core.jobs import JobType
from .core.scrapers.indeed import IndeedScraper
from .core.scrapers.ziprecruiter import ZipRecruiterScraper
from .core.scrapers.linkedin import LinkedInScraper
from .core.scrapers import (
    ScraperInput,
    Site,
    JobResponse,
    CommonResponse,
)


SCRAPER_MAPPING = {
    Site.LINKEDIN: LinkedInScraper,
    Site.INDEED: IndeedScraper,
    Site.ZIP_RECRUITER: ZipRecruiterScraper,
}


def _map_str_to_site(site_name: str) -> Site:
    return Site[site_name.upper()]


def scrape_jobs(
    site_name: str | Site | List[Site],
    search_term: str,
    location: str = "",
    distance: int = None,
    is_remote: bool = False,
    job_type: JobType = None,
    easy_apply: bool = False,  # linkedin
    results_wanted: int = 15,
) -> pd.DataFrame:
    """
    Scrapes job data from the requested job sites.
    :return: pandas dataframe containing job data
    """

    if type(site_name) == str:
        site_name = _map_str_to_site(site_name)

    site_type = [site_name] if type(site_name) == Site else site_name
    scraper_input = ScraperInput(
        site_type=site_type,
        search_term=search_term,
        location=location,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        results_wanted=results_wanted,
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
        scraper_class = SCRAPER_MAPPING[site]
        scraper = scraper_class()
        scraped_data: JobResponse = scraper.scrape(scraper_input)

        return site.value, scraped_data

    results = {}
    for site in scraper_input.site_type:
        site_value, scraped_data = scrape_site(site)
        results[site_value] = scraped_data

    dfs = []

    for site, job_response in results.items():
        for job in job_response.jobs:
            data = job.dict()
            data['site'] = site

            # Formatting JobType
            data['job_type'] = data['job_type'].value if data['job_type'] else None

            # Formatting Location
            location_obj = data.get('location')
            if location_obj and isinstance(location_obj, dict):
                data['city'] = location_obj.get('city', '')
                data['state'] = location_obj.get('state', '')
                data['country'] = location_obj.get('country', 'USA')
            else:
                data['city'] = None
                data['state'] = None
                data['country'] = None

            # Formatting Compensation
            compensation_obj = data.get('compensation')
            if compensation_obj and isinstance(compensation_obj, dict):
                data['interval'] = compensation_obj.get('interval').value if compensation_obj.get('interval') else None
                data['min_amount'] = compensation_obj.get('min_amount')
                data['max_amount'] = compensation_obj.get('max_amount')
                data['currency'] = compensation_obj.get('currency', 'USD')
            else:
                data['interval'] = None
                data['min_amount'] = None
                data['max_amount'] = None
                data['currency'] = None

            job_df = pd.DataFrame([data])
            dfs.append(job_df)

    if dfs:
        df = pd.concat(dfs, ignore_index=True)
        desired_order = ['site', 'title', 'company_name', 'city', 'state', 'job_type',
                         'interval', 'min_amount', 'max_amount', 'job_url', 'description']
        df = df[desired_order]
    else:
        df = pd.DataFrame()

    return df
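A minimal usage sketch for this new entry point, modeled on the tests added at the bottom of this commit (the column selection assumes the desired_order above; actual results depend on the sites being reachable):

from jobscrape import scrape_jobs

jobs_df = scrape_jobs(
    site_name="indeed",
    search_term="software engineer",
    location="austin, tx",
    results_wanted=10,
)
# Columns follow desired_order above: site, title, company_name, city, ...
print(jobs_df[["site", "title", "company_name", "min_amount", "max_amount"]].head())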
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Optional
 from datetime import date
 from enum import Enum

@@ -19,10 +19,11 @@ class JobType(Enum):
     VOLUNTEER = "volunteer"


 class Location(BaseModel):
     country: str = "USA"
     city: str = None
-    state: str = None
+    state: Optional[str] = None


 class CompensationInterval(Enum):

@@ -35,8 +36,8 @@ class CompensationInterval(Enum):

 class Compensation(BaseModel):
     interval: CompensationInterval
-    min_amount: int
-    max_amount: int
+    min_amount: int = None
+    max_amount: int = None
     currency: str = "USD"

@@ -44,11 +45,11 @@ class JobPost(BaseModel):
     title: str
     company_name: str
     job_url: str
-    location: Location
+    location: Optional[Location]

     description: str = None
-    job_type: JobType = None
-    compensation: Compensation = None
+    job_type: Optional[JobType] = None
+    compensation: Optional[Compensation] = None
     date_posted: date = None

@@ -56,7 +57,7 @@ class JobResponse(BaseModel):
     success: bool
     error: str = None

-    total_results: int = None
+    total_results: Optional[int] = None

     jobs: list[JobPost] = []

@@ -64,6 +65,11 @@ class JobResponse(BaseModel):
     @validator("returned_results", pre=True, always=True)
     def set_returned_results(cls, v, values):
-        if v is None and values.get("jobs"):
-            return len(values["jobs"])
+        jobs_list = values.get("jobs")
+
+        if v is None:
+            if jobs_list is not None:
+                return len(jobs_list)
+            else:
+                return 0
         return v
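The net effect of these Optional changes, sketched under pydantic v1 semantics (which the @validator usage above implies); the import path and field values here are assumptions for illustration:

# Hedged import path, inferred from the relative imports in this commit
from jobscrape.core.jobs import JobPost

# A JobPost can now validate without compensation, job_type, or location;
# missing Optional fields default to None under pydantic v1.
job = JobPost(
    title="Software Engineer",  # illustrative values, not scraped data
    company_name="Acme",
    job_url="https://example.com/jobs/1",
)
assert job.compensation is None and job.job_type is None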
@@ -1,5 +1,4 @@
-from ..jobs import *
-from ..formatters import OutputFormat
+from ..jobs import Enum, BaseModel, JobType, JobResponse
 from typing import List, Dict, Optional, Any

@@ -17,12 +16,11 @@ class Site(Enum):
 class ScraperInput(BaseModel):
     site_type: List[Site]
     search_term: str
-    output_format: OutputFormat = OutputFormat.JSON

     location: str = None
-    distance: int = None
+    distance: Optional[int] = None
     is_remote: bool = False
-    job_type: JobType = None
+    job_type: Optional[JobType] = None
     easy_apply: bool = None  # linkedin

     results_wanted: int = 15
@@ -1,22 +1,18 @@
 import re
-import sys
-import math
 import json
-from typing import Optional, Tuple, List
 from datetime import datetime
+from typing import Optional, Tuple, List

 import tls_client
 import urllib.parse
 from bs4 import BeautifulSoup
 from bs4.element import Tag
-from fastapi import status

-from api.core.jobs import *
-from api.core.jobs import JobPost
-from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
+from concurrent.futures import ThreadPoolExecutor, Future
+import math
+import traceback
+import sys

+from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
+from .. import Scraper, ScraperInput, Site, StatusException


 class ParsingException(Exception):

@@ -66,8 +62,8 @@ class IndeedScraper(Scraper):
         response = session.get(self.url + "/jobs", params=params)

         if (
-            response.status_code != status.HTTP_200_OK
-            and response.status_code != status.HTTP_307_TEMPORARY_REDIRECT
+            response.status_code != 200
+            and response.status_code != 307
         ):
             raise StatusException(response.status_code)

@@ -131,7 +127,6 @@ class IndeedScraper(Scraper):
                 location=Location(
                     city=job.get("jobLocationCity"),
                     state=job.get("jobLocationState"),
-                    postal_code=job.get("jobLocationPostal"),
                 ),
                 job_type=job_type,
                 compensation=compensation,

@@ -140,9 +135,11 @@ class IndeedScraper(Scraper):
             )
             return job_post

-        for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-            job_post = process_job(job)
-            job_list.append(job_post)
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            job_results: list[Future] = [
+                executor.submit(process_job, job)
+                for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
+            ]
+
+            job_list = [result.result() for result in job_results if result.result()]

         return job_list, total_num_jobs
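The same submit-then-drain pattern in isolation (a standalone sketch; Future.result() caches its value, so calling it twice in the filter-and-collect line is safe):

from concurrent.futures import ThreadPoolExecutor, Future


def process(n: int) -> int | None:
    return n * 2 if n % 2 == 0 else None  # None mimics a skipped job


with ThreadPoolExecutor(max_workers=10) as executor:
    futures: list[Future] = [executor.submit(process, n) for n in range(1, 7)]
    # Falsy results (skipped jobs) are filtered out, as in process_job above
    results = [f.result() for f in futures if f.result()]

assert results == [4, 8, 12]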
@@ -4,10 +4,9 @@ from datetime import datetime
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import Tag
-from fastapi import status

-from api.core.scrapers import Scraper, ScraperInput, Site
-from api.core.jobs import *
+from .. import Scraper, ScraperInput, Site
+from ...jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval


 class LinkedInScraper(Scraper):

@@ -59,7 +58,7 @@ class LinkedInScraper(Scraper):
             f"{self.url}/jobs/search", params=params, allow_redirects=True
         )

-        if response.status_code != status.HTTP_200_OK:
+        if response.status_code != 200:
             return JobResponse(
                 success=False,
                 error=f"Response returned {response.status_code}",

@@ -118,6 +117,7 @@ class LinkedInScraper(Scraper):
                 date_posted=date_posted,
                 job_url=job_url,
                 job_type=job_type,
+                compensation=Compensation(interval=CompensationInterval.YEARLY, currency="USD")
             )
             job_list.append(job_post)
             if (

@@ -185,7 +185,6 @@ class LinkedInScraper(Scraper):
         employment_type = employment_type_span.get_text(strip=True)
         employment_type = employment_type.lower()
         employment_type = employment_type.replace("-", "")
-        print(employment_type)

         return JobType(employment_type)
@@ -1,18 +1,17 @@
 import math
 import json
 import re
 from datetime import datetime
 from typing import Optional, Tuple, List
 from urllib.parse import urlparse, parse_qs

 import tls_client
-from fastapi import status
 from bs4 import BeautifulSoup
 from bs4.element import Tag
+from concurrent.futures import ThreadPoolExecutor, Future

-from api.core.jobs import JobPost
-from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
-from api.core.jobs import *
+from .. import Scraper, ScraperInput, Site, StatusException
+from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType


 class ZipRecruiterScraper(Scraper):

@@ -26,9 +25,12 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()
+        self.session = tls_client.Session(
+            client_identifier="chrome112", random_tls_extension_order=True
+        )

     def scrape_page(
-        self, scraper_input: ScraperInput, page: int, session: tls_client.Session
+        self, scraper_input: ScraperInput, page: int
     ) -> tuple[list[JobPost], int | None]:
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria

@@ -52,91 +54,47 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "radius": scraper_input.distance,
-            "refine_by_location_type": "only_remote"
-            if scraper_input.is_remote
-            else None,
-            "refine_by_employment": f"employment_type:employment_type:{job_type_value}"
-            if job_type_value
-            else None,
             "page": page,
+            "form": "jobs-landing"
         }
+        if scraper_input.is_remote:
+            params["refine_by_location_type"] = "only_remote"

-        response = session.get(
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+
+        if job_type_value:
+            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"
+
+        response = self.session.get(
             self.url + "/jobs-search",
             headers=ZipRecruiterScraper.headers(),
             params=params,
         )

-        if response.status_code != status.HTTP_200_OK:
+        if response.status_code != 200:
             raise StatusException(response.status_code)

-        html_string = response.content
+        html_string = response.text
         soup = BeautifulSoup(html_string, "html.parser")

-        if page == 1:
-            script_tag = soup.find("script", {"id": "js_variables"})
-            data = json.loads(script_tag.string)
+        script_tag = soup.find("script", {"id": "js_variables"})
+        data = json.loads(script_tag.string)

+        if page == 1:
             job_count = int(data["totalJobCount"].replace(",", ""))
         else:
             job_count = None

-        job_posts = soup.find_all("div", {"class": "job_content"})
-
-        def process_job(job: Tag) -> Optional[JobPost]:
-            """
-            Parses a job from the job content tag
-            :param job: BeautifulSoup Tag for one job post
-            :return JobPost
-            """
-            job_url = job.find("a", {"class": "job_link"})["href"]
-            if job_url in self.seen_urls:
-                return None
-
-            title = job.find("h2", {"class": "title"}).text
-            company = job.find("a", {"class": "company_name"}).text.strip()
-
-            description, updated_job_url = ZipRecruiterScraper.get_description(
-                job_url, session
-            )
-            if updated_job_url is not None:
-                job_url = updated_job_url
-            if description is None:
-                description = job.find("p", {"class": "job_snippet"}).text.strip()
-
-            job_type_element = job.find("li", {"class": "perk_item perk_type"})
-            if job_type_element:
-                job_type_text = (
-                    job_type_element.text.strip()
-                    .lower()
-                    .replace("-", "")
-                    .replace(" ", "")
-                )
-                if job_type_text == "contractor":
-                    job_type_text = "contract"
-                job_type = JobType(job_type_text)
-            else:
-                job_type = None
-
-            date_posted = ZipRecruiterScraper.get_date_posted(job)
-
-            job_post = JobPost(
-                title=title,
-                description=description,
-                company_name=company,
-                location=ZipRecruiterScraper.get_location(job),
-                job_type=job_type,
-                compensation=ZipRecruiterScraper.get_compensation(job),
-                date_posted=date_posted,
-                job_url=job_url,
-            )
-            return job_post
-
         with ThreadPoolExecutor(max_workers=10) as executor:
-            job_results: list[Future] = [
-                executor.submit(process_job, job) for job in job_posts
-            ]
+            if "jobList" in data and data["jobList"]:
+                jobs_js = data["jobList"]
+                job_results = [executor.submit(self.process_job_js, job) for job in jobs_js]
+            else:
+                jobs_html = soup.find_all("div", {"class": "job_content"})
+                job_results = [executor.submit(self.process_job_html, job) for job in jobs_html]

         job_list = [result.result() for result in job_results if result.result()]

@@ -148,19 +106,17 @@ class ZipRecruiterScraper(Scraper):
         :param scraper_input:
         :return: job_response
         """
-        session = tls_client.Session(
-            client_identifier="chrome112", random_tls_extension_order=True
-        )

-        pages_to_process = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
+        pages_to_process = max(3, math.ceil(scraper_input.results_wanted / self.jobs_per_page))

         try:
             #: get first page to initialize session
-            job_list, total_results = self.scrape_page(scraper_input, 1, session)
+            job_list, total_results = self.scrape_page(scraper_input, 1)

             with ThreadPoolExecutor(max_workers=10) as executor:
                 futures: list[Future] = [
-                    executor.submit(self.scrape_page, scraper_input, page, session)
+                    executor.submit(self.scrape_page, scraper_input, page)
                     for page in range(2, pages_to_process + 1)
                 ]

@@ -169,6 +125,7 @@ class ZipRecruiterScraper(Scraper):
                 job_list += jobs
+

         except StatusException as e:
             return JobResponse(
                 success=False,

@@ -192,9 +149,129 @@ class ZipRecruiterScraper(Scraper):
             )
         return job_response

+    def process_job_html(self, job: Tag) -> Optional[JobPost]:
+        """
+        Parses a job from the job content tag
+        :param job: BeautifulSoup Tag for one job post
+        :return JobPost
+        """
+        job_url = job.find("a", {"class": "job_link"})["href"]
+        if job_url in self.seen_urls:
+            return None
+
+        title = job.find("h2", {"class": "title"}).text
+        company = job.find("a", {"class": "company_name"}).text.strip()
+
+        description, updated_job_url = self.get_description(job_url)
+        if updated_job_url is not None:
+            job_url = updated_job_url
+        if description is None:
+            description = job.find("p", {"class": "job_snippet"}).text.strip()
+
+        job_type_element = job.find("li", {"class": "perk_item perk_type"})
+        if job_type_element:
+            job_type_text = (
+                job_type_element.text.strip()
+                .lower()
+                .replace("-", "")
+                .replace(" ", "")
+            )
+            if job_type_text == "contractor":
+                job_type_text = "contract"
+            job_type = JobType(job_type_text)
+        else:
+            job_type = None
+
+        date_posted = ZipRecruiterScraper.get_date_posted(job)
+
+        job_post = JobPost(
+            title=title,
+            description=description,
+            company_name=company,
+            location=ZipRecruiterScraper.get_location(job),
+            job_type=job_type,
+            compensation=ZipRecruiterScraper.get_compensation(job),
+            date_posted=date_posted,
+            job_url=job_url,
+        )
+        return job_post
+
+    def process_job_js(self, job: dict) -> JobPost:
+        # Map the job data to the fields expected by the Pydantic model
+        title = job.get("Title")
+        description = BeautifulSoup(job.get("Snippet", "").strip(), "html.parser").get_text()
+
+        company = job.get("OrgName")
+        location = Location(city=job.get("City"), state=job.get("State"))
+        try:
+            job_type = ZipRecruiterScraper.job_type_from_string(job.get("EmploymentType", "").replace("-", "_").lower())
+        except ValueError:
+            # print(f"Skipping job due to unrecognized job type: {job.get('EmploymentType')}")
+            return None
+
+        formatted_salary = job.get("FormattedSalaryShort", "")
+        salary_parts = formatted_salary.split(" ")
+
+        min_salary_str = salary_parts[0][1:].replace(",", "")
+        if "." in min_salary_str:
+            min_amount = int(float(min_salary_str) * 1000)
+        else:
+            min_amount = int(min_salary_str.replace("K", "000"))
+
+        if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
+            max_salary_str = salary_parts[2][1:].replace(",", "")
+            if "." in max_salary_str:
+                max_amount = int(float(max_salary_str) * 1000)
+            else:
+                max_amount = int(max_salary_str.replace("K", "000"))
+        else:
+            max_amount = 0
+
+        compensation = Compensation(
+            interval=CompensationInterval.YEARLY,
+            min_amount=min_amount,
+            max_amount=max_amount
+        )
+        save_job_url = job.get("SaveJobURL", "")
+        posted_time_match = re.search(r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url)
+        if posted_time_match:
+            date_time_str = posted_time_match.group(1)
+            date_posted_obj = datetime.strptime(date_time_str, "%Y-%m-%dT%H:%M:%SZ")
+            date_posted = date_posted_obj.date()
+        else:
+            date_posted = date.today()
+        job_url = job.get("JobURL")
+
+        return JobPost(
+            title=title,
+            description=description,
+            company_name=company,
+            location=location,
+            job_type=job_type,
+            compensation=compensation,
+            date_posted=date_posted,
+            job_url=job_url,
+        )
+
+    @staticmethod
+    def job_type_from_string(value: str) -> Optional[JobType]:
+        if not value:
+            return None
+
+        if value.lower() == "contractor":
+            value = "contract"
+        normalized_value = value.replace("_", "")
+        for item in JobType:
+            if item.value == normalized_value:
+                return item
+        raise ValueError(f"Invalid value for JobType: {value}")
+
     def get_description(
-        job_page_url: str, session: tls_client.Session
+        self,
+        job_page_url: str
     ) -> Tuple[Optional[str], Optional[str]]:
         """
         Retrieves job description by going to the job page url

@@ -202,7 +279,7 @@ class ZipRecruiterScraper(Scraper):
         :param session:
         :return: description or None, response url
         """
-        response = session.get(
+        response = self.session.get(
             job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
         )
         if response.status_code not in range(200, 400):
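A worked example of the FormattedSalaryShort parsing in process_job_js above; the "$80K - $100K" input shape is an assumption inferred from the code, not from ZipRecruiter documentation:

formatted_salary = "$80K - $100K"  # assumed input shape
salary_parts = formatted_salary.split(" ")  # ["$80K", "-", "$100K"]

min_salary_str = salary_parts[0][1:].replace(",", "")  # "80K"
min_amount = int(min_salary_str.replace("K", "000"))   # 80000

# salary_parts[2] exists and starts with "$", so the max branch runs
max_salary_str = salary_parts[2][1:].replace(",", "")  # "100K"
max_amount = int(max_salary_str.replace("K", "000"))   # 100000

assert (min_amount, max_amount) == (80000, 100000)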
main.py
@@ -1,16 +0,0 @@
from fastapi import FastAPI

from supabase_py import create_client, Client
from api import router as api_router

app = FastAPI(
    title="JobSpy Backend",
    description="Endpoints for job board (LinkedIn, Indeed, and ZipRecruiter) scrapers",
    version="1.0.0",
)
app.include_router(api_router)


@app.get("/health", tags=["health"])
async def health_check():
    return {"message": "JobSpy ready to scrape"}
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -1,15 +0,0 @@
{
    "id": "a7ea6d58-8dca-4216-97a9-224dadc1e18f",
    "name": "JobSpy",
    "values": [
        {
            "key": "access_token",
            "value": "",
            "type": "any",
            "enabled": true
        }
    ],
    "_postman_variable_scope": "environment",
    "_postman_exported_at": "2023-07-09T23:51:36.709Z",
    "_postman_exported_using": "Postman/10.15.8"
}
@@ -0,0 +1,23 @@
[tool.poetry]
name = "jobscrape"
version = "0.1.0"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <69336300+ZacharyHampton@users.noreply.github.com>", "Cullen Watson <cullen@cullen.ai>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.31.0"
tls-client = "^0.2.1"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
pydantic = "^2.3.0"


[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
@@ -1,61 +0,0 @@
anyio==3.7.1
atomicwrites==1.4.1
attrs==23.1.0
bcrypt==4.0.1
beautifulsoup4==4.12.2
cachetools==5.3.1
certifi==2023.5.7
cffi==1.15.1
chardet==4.0.0
charset-normalizer==3.2.0
click==8.1.4
colorama==0.4.6
cryptography==41.0.1
dataclasses==0.6
deprecation==2.1.0
ecdsa==0.18.0
exceptiongroup==1.1.2
fastapi==0.99.1
google-auth==2.22.0
google-auth-oauthlib==1.0.0
gotrue==0.2.0
gspread==5.10.0
h11==0.14.0
httpcore==0.12.3
httplib2==0.22.0
httpx==0.16.1
idna==2.10
iniconfig==2.0.0
oauth2client==4.1.3
oauthlib==3.2.2
packaging==23.1
passlib==1.7.4
pluggy==1.2.0
postgrest-py==0.4.0
py==1.11.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==1.10.11
pyparsing==3.1.1
pytest==6.2.5
python-dateutil==2.8.2
python-dotenv==1.0.0
python-jose==3.3.0
python-multipart==0.0.6
realtime-py==0.1.3
requests==2.25.1
requests-oauthlib==1.3.1
rfc3986==1.5.0
rsa==4.9
six==1.16.0
sniffio==1.3.0
soupsieve==2.4.1
starlette==0.27.0
supabase-py==0.0.2
tls-client==0.2.1
toml==0.10.2
typing_extensions==4.7.1
urllib3==1.26.16
uvicorn==0.22.0
websockets==9.1
settings.py
@@ -1,14 +0,0 @@
from dotenv import load_dotenv
import os

load_dotenv()
# gsheets (template to copy at https://docs.google.com/spreadsheets/d/1mOgb-ZGZy_YIhnW9OCqIVvkFwiKFvhMBjNcbakW7BLo/edit?usp=sharing)
GSHEET_NAME = os.environ.get("GSHEET_NAME", "JobSpy")

# optional auth
AUTH_REQUIRED = False
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
ACCESS_TOKEN_EXPIRE_MINUTES = 60
ALGORITHM = "HS256"
@@ -0,0 +1,10 @@
from jobscrape import scrape_jobs


def test_indeed():
    result = scrape_jobs(
        site_name="indeed",
        search_term="software engineer",
    )

    assert result is not None
@@ -0,0 +1,10 @@
from jobscrape import scrape_jobs


def test_ziprecruiter():
    result = scrape_jobs(
        site_name="zip_recruiter",
        search_term="software engineer",
    )

    assert result is not None
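For symmetry, a hedged sketch of the equivalent LinkedIn test (not part of this commit; it mirrors the two tests above and assumes "linkedin" maps to Site.LINKEDIN via _map_str_to_site):

from jobscrape import scrape_jobs


def test_linkedin():
    result = scrape_jobs(
        site_name="linkedin",
        search_term="software engineer",
    )

    assert result is not None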