mirror of https://github.com/Bunsly/JobSpy

Merge pull request #3 from JobSpy-ai/feature/scrapers/Add-LinkedIn-scraper
feat: add LinkedIn scraper

commit 34ca9daefd
@@ -1,12 +1,13 @@
 import json
 import re
-import math
+from math import ceil
 
 import tls_client
 from bs4 import BeautifulSoup
 
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
 
 
 class IndeedScraper(Scraper):
@@ -28,14 +29,15 @@ class IndeedScraper(Scraper):
         }
 
         response = session.get(self.url, params=params)
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result
 
         soup = BeautifulSoup(response.content, "html.parser")
 
         jobs = IndeedScraper.parse_jobs(soup)
         total_num_jobs = IndeedScraper.total_jobs(soup)
-        total_pages = math.ceil(total_num_jobs / 15)
+        total_pages = ceil(total_num_jobs / 15)
 
         job_list: list[JobPost] = []
         # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"]
@@ -75,7 +77,6 @@ class IndeedScraper(Scraper):
                 title=job["normTitle"],
                 description=first_li.text if first_li else None,
                 company_name=job["company"],
-                industry=None,
                 location=Location(
                     city=job["jobLocationCity"],
                     state=job["jobLocationState"],
@@ -0,0 +1,102 @@
+from datetime import datetime
+from math import ceil
+
+import requests
+from bs4 import BeautifulSoup
+
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
+
+
+class LinkedInScraper(Scraper):
+    def __init__(self):
+        site = Site(Site.LINKEDIN)
+        super().__init__(site)
+
+        self.url = "https://www.linkedin.com/jobs"
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location}
+
+        self.url = f"{self.url}/{scraper_input.search_term}-jobs"
+        response = requests.get(self.url, params=params)
+        success, result = handle_response(response)
+        if not success:
+            return result
+
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        job_list: list[JobPost] = []
+        for job_card in soup.find_all(
+            "div",
+            class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
+        ):
+            job_url_tag = job_card.find("a", class_="base-card__full-link")
+            job_url = job_url_tag["href"] if job_url_tag else "N/A"
+
+            job_info = job_card.find("div", class_="base-search-card__info")
+            if job_info is not None:
+                title_tag = job_info.find("h3", class_="base-search-card__title")
+                title = title_tag.text.strip() if title_tag else "N/A"
+
+                company_tag = job_info.find("a", class_="hidden-nested-link")
+                company = company_tag.text.strip() if company_tag else "N/A"
+
+                metadata_card = job_info.find(
+                    "div", class_="base-search-card__metadata"
+                )
+                location: Location = LinkedInScraper.get_location(metadata_card)
+
+                # metadata_card can be absent; default date_posted so JobPost never sees an unbound name
+                date_posted = None
+                datetime_tag = (
+                    metadata_card.find("time", class_="job-search-card__listdate")
+                    if metadata_card
+                    else None
+                )
+                if datetime_tag:
+                    datetime_str = datetime_tag["datetime"]
+                    date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+
+                job_post = JobPost(
+                    title=title,
+                    company_name=company,
+                    location=location,
+                    date_posted=date_posted,
+                    delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                )
+                job_list.append(job_post)
+
+        job_count_text = soup.find(
+            "span", class_="results-context-header__job-count"
+        ).text
+        job_count = int("".join(filter(str.isdigit, job_count_text)))
+        total_pages = ceil(job_count / 25)  # LinkedIn serves 25 results per page
+        job_response = JobResponse(
+            jobs=job_list,
+            job_count=job_count,
+            page=scraper_input.page,
+            total_pages=total_pages,
+        )
+        return job_response
+
+    @staticmethod
+    def get_location(metadata_card):
+        # start from a country-only Location so every code path returns a bound value
+        location = Location(country="US")
+        if metadata_card is not None:
+            location_tag = metadata_card.find(
+                "span", class_="job-search-card__location"
+            )
+            location_string = location_tag.text.strip() if location_tag else "N/A"
+            parts = location_string.split(", ")
+            if len(parts) == 2:
+                city, state = parts
+                location = Location(
+                    country="US",
+                    city=city,
+                    state=state,
+                )
+
+        return location
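Two review notes on the new scraper: the job-card selector is a long Tailwind-style class string, so any LinkedIn markup change will silently yield zero results, and scrape() mutates self.url by appending "{search_term}-jobs", so each LinkedInScraper instance is effectively single-use. For reference, a minimal driving sketch built only from names this diff introduces (the search term and location are example values, not from the commit):

    from api.core.scrapers import ScraperInput
    from api.core.scrapers.linkedin import LinkedInScraper

    scraper = LinkedInScraper()
    scraper_input = ScraperInput(
        search_term="software engineer",  # example values
        location="Dallas, TX",
        page=1,
    )
    job_response = scraper.scrape(scraper_input)
    print(job_response.job_count, len(job_response.jobs))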
@@ -4,13 +4,14 @@ from urllib.parse import urlparse, parse_qs
 import tls_client
 from bs4 import BeautifulSoup
 
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
 
 
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
-        site = Site(Site.INDEED)
+        site = Site(Site.ZIP_RECRUITER)
         super().__init__(site)
 
         self.url = "https://www.ziprecruiter.com/jobs-search"
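The Site(Site.INDEED) line was a copy-paste slip that registered the ZipRecruiter scraper under the wrong board. Note also that calling an Enum with one of its own members is just a value lookup that returns that member, so plain Site.ZIP_RECRUITER would read more directly. A minimal sketch of the assumed relationship; the real Site and Scraper definitions live in api.core.scrapers and are not part of this diff:

    from enum import Enum


    class Site(Enum):  # hypothetical stand-in for the real enum
        LINKEDIN = "linkedin"
        INDEED = "indeed"
        ZIP_RECRUITER = "zip_recruiter"


    class Scraper:  # hypothetical stand-in for the real base class
        def __init__(self, site: Site):
            self.site = site  # records which job board this scraper targets


    assert Site(Site.ZIP_RECRUITER) is Site.ZIP_RECRUITER  # the enum call is a no-op lookup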
@@ -23,14 +24,15 @@ class ZipRecruiterScraper(Scraper):
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "page": min(scraper_input.page if scraper_input.page else 1, 10),
+            "page": min(scraper_input.page, 10),
         }
 
         response = session.get(
             self.url, headers=ZipRecruiterScraper.headers(), params=params
         )
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result
 
         html_string = response.content
         soup = BeautifulSoup(html_string, "html.parser")
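Dropping the "scraper_input.page if scraper_input.page else 1" fallback is only safe if page can no longer arrive as None; the route below still declares page: int = None, so the assumption must be that ScraperInput itself supplies the default. A sketch of that assumption, since the real model is not shown in this diff:

    from typing import Optional

    from pydantic import BaseModel, validator


    class ScraperInput(BaseModel):  # hypothetical shape of the real model
        search_term: str
        location: str
        page: Optional[int] = 1

        @validator("page", pre=True, always=True)
        def _default_page(cls, v):
            # coerce an explicit None (as the route passes) back to page 1
            return 1 if v is None else v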
@@ -57,7 +59,6 @@ class ZipRecruiterScraper(Scraper):
                 title=title,
                 description=description,
                 company_name=company,
-                industry=None,
                 location=ZipRecruiterScraper.get_location(job),
                 job_type=job_type,
                 compensation=ZipRecruiterScraper.get_compensation(job),
@@ -0,0 +1,20 @@
+def handle_response(response):
+    if response.status_code == 200:
+        try:
+            return True, response.json()
+        except ValueError:
+            return True, response.text
+
+    try:
+        error_msg = response.json().get("message", "No detailed message provided.")
+    except ValueError:
+        error_msg = "No detailed message provided."
+
+    error = {
+        "message": "An error occurred during the request.",
+        "status_code": response.status_code,
+        "url": response.url,
+        "details": error_msg,
+    }
+
+    return False, error
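The new shared helper normalizes every response into a (success, payload) tuple: parsed JSON when the body is JSON, raw text otherwise, and a uniform error dict on any non-200 status. That contract is what lets each scraper collapse its status handling to two lines. A quick usage sketch; the URL is an arbitrary example:

    import requests

    from api.core.utils import handle_response

    response = requests.get("https://httpbin.org/status/404")  # example request
    success, result = handle_response(response)
    if not success:
        # result is the error dict built above: message, status_code, url, details
        print(result["status_code"], result["details"])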
@@ -2,8 +2,8 @@ from fastapi import APIRouter
 
 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
+from api.core.scrapers.linkedin import LinkedInScraper
 from api.core.scrapers import ScraperInput
-from api.core.jobs import JobResponse
 
 router = APIRouter(prefix="/jobs")
 
@@ -13,17 +13,19 @@ async def scrape_jobs(
     site_type: str, search_term: str, location: str, page: int = None
 ):
     job_response = {"message": "site type not found"}
-    if site_type == "indeed":
-        indeed_scraper = IndeedScraper()
+    scraper_dict = {
+        "indeed": IndeedScraper,
+        "linkedin": LinkedInScraper,
+        "zip": ZipRecruiterScraper,
+    }
+
+    scraper_class = scraper_dict.get(site_type)
+    if scraper_class:
+        scraper = scraper_class()
         scraper_input = ScraperInput(
             search_term=search_term, location=location, page=page
         )
-        job_response = indeed_scraper.scrape(scraper_input)
-    elif site_type == "zip":
-        ziprecruiter_scraper = ZipRecruiterScraper()
-        scraper_input = ScraperInput(
-            search_term=search_term, location=location, page=page
-        )
-        job_response = ziprecruiter_scraper.scrape(scraper_input)
+        job_response = scraper.scrape(scraper_input)
 
     return job_response
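With dispatch going through scraper_dict, supporting another board is one import plus one dictionary entry. A hedged end-to-end sketch using FastAPI's TestClient follows; the hunk does not show the route decorator, so the path and query layout below are assumptions based only on the /jobs prefix and the handler signature:

    from fastapi.testclient import TestClient

    from main import app

    client = TestClient(app)

    # assumed URL shape; only the /jobs prefix is confirmed by this diff
    response = client.get(
        "/jobs/linkedin",
        params={"search_term": "software engineer", "location": "Dallas, TX"},
    )
    print(response.json())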
main.py
@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+
 from api import router as api_router
 
 app = FastAPI()
@@ -7,4 +8,4 @@ app.include_router(api_router)
 
 @app.get("/")
 async def root():
-    return {"message": "Hello World"}
+    return {"message": "JobSpy Backend"}
@@ -0,0 +1,7 @@
+fastapi~=0.99.1
+pydantic~=1.10.11
+beautifulsoup4~=4.12.2
+requests~=2.31.0
+pip~=21.3.1
+wheel~=0.37.1
+setuptools~=60.2.0