Merge pull request #3 from JobSpy-ai/feature/scrapers/Add-LinkedIn-scraper

feat: add LinkedIn scraper
Cullen 2023-07-08 09:36:06 -05:00 committed by GitHub
commit 34ca9daefd
7 changed files with 152 additions and 25 deletions

api/core/scrapers/indeed (Indeed scraper)

@@ -1,12 +1,13 @@
 import json
 import re
-import math
+from math import ceil
 import tls_client
 from bs4 import BeautifulSoup
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
 
 
 class IndeedScraper(Scraper):
@@ -28,14 +29,15 @@ class IndeedScraper(Scraper):
         }
 
         response = session.get(self.url, params=params)
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result
 
         soup = BeautifulSoup(response.content, "html.parser")
         jobs = IndeedScraper.parse_jobs(soup)
         total_num_jobs = IndeedScraper.total_jobs(soup)
-        total_pages = math.ceil(total_num_jobs / 15)
+        total_pages = ceil(total_num_jobs / 15)
 
         job_list: list[JobPost] = []
         # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"]
@@ -75,7 +77,6 @@ class IndeedScraper(Scraper):
                     title=job["normTitle"],
                     description=first_li.text if first_li else None,
                     company_name=job["company"],
-                    industry=None,
                     location=Location(
                         city=job["jobLocationCity"],
                         state=job["jobLocationState"],
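
Both HTML scrapers derive their page totals the same way: the scraped job count divided by the site's fixed page size, rounded up. A quick worked example (the counts below are illustrative):

    from math import ceil

    # Indeed renders 15 results per page, so 100 matches span 7 pages,
    # the last of them only partially filled.
    assert ceil(100 / 15) == 7

    # LinkedIn (added below) renders 25 results per page.
    assert ceil(100 / 25) == 4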

api/core/scrapers/linkedin (new file: LinkedIn scraper)

@@ -0,0 +1,95 @@
+from math import ceil
+
+import requests
+from bs4 import BeautifulSoup
+
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
+
+
+class LinkedInScraper(Scraper):
+    def __init__(self):
+        site = Site(Site.LINKEDIN)
+        super().__init__(site)
+
+        self.url = "https://www.linkedin.com/jobs"
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location}
+
+        self.url = f"{self.url}/{scraper_input.search_term}-jobs"
+        response = requests.get(self.url, params=params)
+
+        success, result = handle_response(response)
+        if not success:
+            return result
+
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        job_list: list[JobPost] = []
+        for job_card in soup.find_all(
+            "div",
+            class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
+        ):
+            job_url_tag = job_card.find("a", class_="base-card__full-link")
+            job_url = job_url_tag["href"] if job_url_tag else "N/A"
+
+            job_info = job_card.find("div", class_="base-search-card__info")
+            if job_info is not None:
+                title_tag = job_info.find("h3", class_="base-search-card__title")
+                title = title_tag.text.strip() if title_tag else "N/A"
+
+                company_tag = job_info.find("a", class_="hidden-nested-link")
+                company = company_tag.text.strip() if company_tag else "N/A"
+
+                metadata_card = job_info.find(
+                    "div", class_="base-search-card__metadata"
+                )
+                location: Location = LinkedInScraper.get_location(metadata_card)
+
+                # Default to None so date_posted is always bound, even when the
+                # card carries no <time> element or no metadata at all.
+                date_posted = None
+                datetime_tag = (
+                    metadata_card.find("time", class_="job-search-card__listdate")
+                    if metadata_card
+                    else None
+                )
+                if datetime_tag:
+                    datetime_str = datetime_tag["datetime"]
+                    date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+
+                job_post = JobPost(
+                    title=title,
+                    company_name=company,
+                    location=location,
+                    date_posted=date_posted,
+                    delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                )
+                job_list.append(job_post)
+
+        job_count_text = soup.find(
+            "span", class_="results-context-header__job-count"
+        ).text
+        job_count = int("".join(filter(str.isdigit, job_count_text)))
+        total_pages = ceil(job_count / 25)  # LinkedIn serves 25 results per page
+        job_response = JobResponse(
+            jobs=job_list,
+            job_count=job_count,
+            page=scraper_input.page,
+            total_pages=total_pages,
+        )
+        return job_response
+
+    @staticmethod
+    def get_location(metadata_card):
+        # Start from a default so the function never returns an unbound name
+        # when the card is missing or formatted unexpectedly (assumes the
+        # model's city/state fields are optional).
+        location = Location(country="US", city=None, state=None)
+        if metadata_card is not None:
+            location_tag = metadata_card.find(
+                "span", class_="job-search-card__location"
+            )
+            location_string = location_tag.text.strip() if location_tag else "N/A"
+            parts = location_string.split(", ")
+            if len(parts) == 2:
+                city, state = parts
+                location = Location(
+                    country="US",
+                    city=city,
+                    state=state,
+                )
+        return location
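
Driving the new scraper outside FastAPI looks like the sketch below. The search term and location are illustrative, and the ScraperInput keyword arguments mirror the routes module further down; note that the search term is interpolated straight into the URL path, so a single token is safest:

    from api.core.scrapers import ScraperInput
    from api.core.scrapers.linkedin import LinkedInScraper

    scraper = LinkedInScraper()
    scraper_input = ScraperInput(search_term="python", location="Dallas, TX", page=1)

    # Returns a JobResponse on success, or the error dict from handle_response.
    job_response = scraper.scrape(scraper_input)
    print(job_response.job_count, "jobs across", job_response.total_pages, "pages")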

api/core/scrapers/ziprecruiter (ZipRecruiter scraper)

@@ -4,13 +4,14 @@ from urllib.parse import urlparse, parse_qs
 import tls_client
 from bs4 import BeautifulSoup
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
 
 
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
-        site = Site(Site.INDEED)
+        site = Site(Site.ZIP_RECRUITER)
         super().__init__(site)
 
         self.url = "https://www.ziprecruiter.com/jobs-search"
@@ -23,14 +24,15 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "page": min(scraper_input.page if scraper_input.page else 1, 10),
+            "page": min(scraper_input.page, 10),
         }
 
         response = session.get(
             self.url, headers=ZipRecruiterScraper.headers(), params=params
         )
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result
 
         html_string = response.content
         soup = BeautifulSoup(html_string, "html.parser")
@@ -57,7 +59,6 @@ class ZipRecruiterScraper(Scraper):
                 title=title,
                 description=description,
                 company_name=company,
-                industry=None,
                 location=ZipRecruiterScraper.get_location(job),
                 job_type=job_type,
                 compensation=ZipRecruiterScraper.get_compensation(job),
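
Two fixes ride along with the shared error handling: the scraper now registers itself as Site.ZIP_RECRUITER rather than Site.INDEED, and the requested page is clamped to what appears to be ZipRecruiter's 10-page ceiling:

    # min() pins any out-of-range page request to the ceiling.
    assert min(3, 10) == 3
    assert min(42, 10) == 10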

api/core/utils.py (new file, 20 lines)

@@ -0,0 +1,20 @@
+def handle_response(response):
+    """Return (success, payload): the parsed body on HTTP 200, or a
+    structured error dict otherwise."""
+    if response.status_code == 200:
+        try:
+            return True, response.json()
+        except ValueError:
+            # Body was not JSON; fall back to the raw text.
+            return True, response.text
+
+    try:
+        error_msg = response.json().get("message", "No detailed message provided.")
+    except ValueError:
+        error_msg = "No detailed message provided."
+
+    error = {
+        "message": "An error occurred during the request.",
+        "status_code": response.status_code,
+        "url": response.url,
+        "details": error_msg,
+    }
+    return False, error
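
Every call site unpacks a (success, payload) tuple and bails out early on failure, the pattern all three scrapers now share. A minimal usage sketch (the URL is illustrative):

    import requests

    from api.core.utils import handle_response

    response = requests.get("https://example.com/api/jobs")
    success, result = handle_response(response)
    if success:
        print(result)  # parsed JSON if the body was JSON, otherwise raw text
    else:
        print(result["status_code"], result["details"])  # structured error dict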

API routes module

@@ -2,8 +2,8 @@ from fastapi import APIRouter
 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
+from api.core.scrapers.linkedin import LinkedInScraper
 from api.core.scrapers import ScraperInput
-from api.core.jobs import JobResponse
 
 router = APIRouter(prefix="/jobs")
@@ -13,17 +13,19 @@ async def scrape_jobs(
     site_type: str, search_term: str, location: str, page: int = None
 ):
     job_response = {"message": "site type not found"}
 
-    if site_type == "indeed":
-        indeed_scraper = IndeedScraper()
+    scraper_dict = {
+        "indeed": IndeedScraper,
+        "linkedin": LinkedInScraper,
+        "zip": ZipRecruiterScraper,
+    }
+
+    scraper_class = scraper_dict.get(site_type)
+
+    if scraper_class:
+        scraper = scraper_class()
         scraper_input = ScraperInput(
             search_term=search_term, location=location, page=page
         )
-        job_response = indeed_scraper.scrape(scraper_input)
-    elif site_type == "zip":
-        ziprecruiter_scraper = ZipRecruiterScraper()
-        scraper_input = ScraperInput(
-            search_term=search_term, location=location, page=page
-        )
-        job_response = ziprecruiter_scraper.scrape(scraper_input)
+        job_response = scraper.scrape(scraper_input)
 
     return job_response
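
Swapping the if/elif chain for a class registry makes adding a scraper a one-line change to scraper_dict. The pattern in isolation, as a self-contained sketch with hypothetical names:

    from typing import Optional

    class BaseScraper: ...
    class FooScraper(BaseScraper): ...
    class BarScraper(BaseScraper): ...

    REGISTRY: dict[str, type[BaseScraper]] = {"foo": FooScraper, "bar": BarScraper}

    def build(site_type: str) -> Optional[BaseScraper]:
        cls = REGISTRY.get(site_type)  # None for unknown keys
        return cls() if cls else None  # instantiate only on a match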

FastAPI application entrypoint

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+
 from api import router as api_router
 
 app = FastAPI()
@@ -7,4 +8,4 @@ app.include_router(api_router)
 @app.get("/")
 async def root():
-    return {"message": "Hello World"}
+    return {"message": "JobSpy Backend"}

requirements.txt (new file, 7 lines)

@@ -0,0 +1,7 @@
+fastapi~=0.99.1
+pydantic~=1.10.11
+beautifulsoup4~=4.12.2
+requests~=2.31.0
+pip~=21.3.1
+wheel~=0.37.1
+setuptools~=60.2.0