diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py
index 90500a1..eb200c3 100644
--- a/api/core/scrapers/indeed/__init__.py
+++ b/api/core/scrapers/indeed/__init__.py
@@ -1,12 +1,13 @@
 import json
 import re
-import math
+from math import ceil
 
 import tls_client
 from bs4 import BeautifulSoup
 
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
 
 
 class IndeedScraper(Scraper):
@@ -28,14 +29,15 @@ class IndeedScraper(Scraper):
         }
 
         response = session.get(self.url, params=params)
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result
 
         soup = BeautifulSoup(response.content, "html.parser")
 
         jobs = IndeedScraper.parse_jobs(soup)
         total_num_jobs = IndeedScraper.total_jobs(soup)
-        total_pages = math.ceil(total_num_jobs / 15)
+        total_pages = ceil(total_num_jobs / 15)
 
         job_list: list[JobPost] = []
         # page_number = jobs["metaData"]["mosaicProviderJobCardsModel"]["pageNumber"]
@@ -75,7 +77,6 @@ class IndeedScraper(Scraper):
                 title=job["normTitle"],
                 description=first_li.text if first_li else None,
                 company_name=job["company"],
-                industry=None,
                 location=Location(
                     city=job["jobLocationCity"],
                     state=job["jobLocationState"],
diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py
new file mode 100644
index 0000000..6ebde13
--- /dev/null
+++ b/api/core/scrapers/linkedin/__init__.py
@@ -0,0 +1,102 @@
+from datetime import datetime
+from math import ceil
+
+import requests
+from bs4 import BeautifulSoup
+
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
+
+
+class LinkedInScraper(Scraper):
+    def __init__(self):
+        site = Site(Site.LINKEDIN)
+        super().__init__(site)
+
+        self.url = "https://www.linkedin.com/jobs"
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        params = {"pageNum": scraper_input.page - 1, "location": scraper_input.location}
+
+        self.url = f"{self.url}/{scraper_input.search_term}-jobs"
+        response = requests.get(self.url, params=params)
+        success, result = handle_response(response)
+        if not success:
+            return result
+
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        job_list: list[JobPost] = []
+        for job_card in soup.find_all(
+            "div",
+            class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
+        ):
+            job_url_tag = job_card.find("a", class_="base-card__full-link")
+            job_url = job_url_tag["href"] if job_url_tag else "N/A"
+
+            job_info = job_card.find("div", class_="base-search-card__info")
+            if job_info is not None:
+                title_tag = job_info.find("h3", class_="base-search-card__title")
+                title = title_tag.text.strip() if title_tag else "N/A"
+
+                company_tag = job_info.find("a", class_="hidden-nested-link")
+                company = company_tag.text.strip() if company_tag else "N/A"
+
+                metadata_card = job_info.find(
+                    "div", class_="base-search-card__metadata"
+                )
+                location: Location = LinkedInScraper.get_location(metadata_card)
+
+                # not every card lists a posting date, so default to None
+                date_posted = None
+                if metadata_card:
+                    datetime_tag = metadata_card.find(
+                        "time", class_="job-search-card__listdate"
+                    )
+                    if datetime_tag:
+                        datetime_str = datetime_tag["datetime"]
+                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+
+                job_post = JobPost(
+                    title=title,
+                    company_name=company,
+                    location=location,
+                    date_posted=date_posted,
+                    delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                )
+                job_list.append(job_post)
+
+        job_count_tag = soup.find(
+            "span", class_="results-context-header__job-count"
+        )
+        job_count_text = job_count_tag.text if job_count_tag else "0"
+        job_count = int("".join(filter(str.isdigit, job_count_text)))
+        total_pages = ceil(job_count / 25)
+        job_response = JobResponse(
+            jobs=job_list,
+            job_count=job_count,
+            page=scraper_input.page,
+            total_pages=total_pages,
+        )
+        return job_response
+
+    @staticmethod
+    def get_location(metadata_card):
+        # fall back to a country-only Location when the card has no "City, ST" string
+        location = Location(country="US")
+        if metadata_card is not None:
+            location_tag = metadata_card.find(
+                "span", class_="job-search-card__location"
+            )
+            location_string = location_tag.text.strip() if location_tag else "N/A"
+            parts = location_string.split(", ")
+            if len(parts) == 2:
+                city, state = parts
+                location = Location(
+                    country="US",
+                    city=city,
+                    state=state,
+                )
+
+        return location
diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py
index 5f674ee..65634eb 100644
--- a/api/core/scrapers/ziprecruiter/__init__.py
+++ b/api/core/scrapers/ziprecruiter/__init__.py
@@ -4,13 +4,14 @@ from urllib.parse import urlparse, parse_qs
 import tls_client
 from bs4 import BeautifulSoup
 
-from .. import Scraper, ScraperInput, Site
-from ...jobs import *
+from api.core.scrapers import Scraper, ScraperInput, Site
+from api.core.jobs import *
+from api.core.utils import handle_response
 
 
 class ZipRecruiterScraper(Scraper):
     def __init__(self):
-        site = Site(Site.INDEED)
+        site = Site(Site.ZIP_RECRUITER)
         super().__init__(site)
 
         self.url = "https://www.ziprecruiter.com/jobs-search"
@@ -23,14 +24,15 @@
         params = {
             "search": scraper_input.search_term,
             "location": scraper_input.location,
-            "page": min(scraper_input.page if scraper_input.page else 1, 10),
+            "page": min(scraper_input.page, 10),
         }
 
         response = session.get(
             self.url, headers=ZipRecruiterScraper.headers(), params=params
         )
-        if response.status_code != 200:
-            return {"message": f"Error - Status Code: {response.status_code}"}
+        success, result = handle_response(response)
+        if not success:
+            return result
 
         html_string = response.content
         soup = BeautifulSoup(html_string, "html.parser")
@@ -57,7 +59,6 @@
                 title=title,
                 description=description,
                 company_name=company,
-                industry=None,
                 location=ZipRecruiterScraper.get_location(job),
                 job_type=job_type,
                 compensation=ZipRecruiterScraper.get_compensation(job),
diff --git a/api/core/utils.py b/api/core/utils.py
new file mode 100644
index 0000000..f4a3ec2
--- /dev/null
+++ b/api/core/utils.py
@@ -0,0 +1,22 @@
+def handle_response(response):
+    """Return (True, payload) for a 200 response, else (False, error details)."""
+    if response.status_code == 200:
+        try:
+            return True, response.json()
+        except ValueError:
+            return True, response.text
+
+    try:
+        error_msg = response.json().get("message", "No detailed message provided.")
+    except ValueError:
+        error_msg = "No detailed message provided."
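+    # error_msg is now either the API's own "message" field or the generic fallback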
+
+    error = {
+        "message": "An error occurred during the request.",
+        "status_code": response.status_code,
+        "url": response.url,
+        "details": error_msg,
+    }
+
+    return False, error
diff --git a/api/v1/jobs/__init__.py b/api/v1/jobs/__init__.py
index 022cd11..44aa6c7 100644
--- a/api/v1/jobs/__init__.py
+++ b/api/v1/jobs/__init__.py
@@ -2,8 +2,8 @@ from fastapi import APIRouter
 
 from api.core.scrapers.indeed import IndeedScraper
 from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
+from api.core.scrapers.linkedin import LinkedInScraper
 from api.core.scrapers import ScraperInput
-from api.core.jobs import JobResponse
 
 
 router = APIRouter(prefix="/jobs")
@@ -13,17 +13,19 @@ async def scrape_jobs(
     site_type: str, search_term: str, location: str, page: int = None
 ):
     job_response = {"message": "site type not found"}
-    if site_type == "indeed":
-        indeed_scraper = IndeedScraper()
+
+    scraper_dict = {
+        "indeed": IndeedScraper,
+        "linkedin": LinkedInScraper,
+        "zip": ZipRecruiterScraper,
+    }
+
+    scraper_class = scraper_dict.get(site_type)
+    if scraper_class:
+        scraper = scraper_class()
         scraper_input = ScraperInput(
             search_term=search_term, location=location, page=page
         )
-        job_response = indeed_scraper.scrape(scraper_input)
-    elif site_type == "zip":
-        ziprecruiter_scraper = ZipRecruiterScraper()
-        scraper_input = ScraperInput(
-            search_term=search_term, location=location, page=page
-        )
-        job_response = ziprecruiter_scraper.scrape(scraper_input)
+        job_response = scraper.scrape(scraper_input)
 
     return job_response
diff --git a/main.py b/main.py
index c870be2..2028fb9 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+
 from api import router as api_router
 
 app = FastAPI()
@@ -7,4 +8,4 @@ app.include_router(api_router)
 
 @app.get("/")
 async def root():
-    return {"message": "Hello World"}
+    return {"message": "JobSpy Backend"}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..dddfd55
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+fastapi~=0.99.1
+pydantic~=1.10.11
+beautifulsoup4~=4.12.2
+requests~=2.31.0
+tls-client  # imported by the Indeed and ZipRecruiter scrapers; intentionally unpinned
+pip~=21.3.1
+wheel~=0.37.1
+setuptools~=60.2.0
\ No newline at end of file
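
Example request (a sketch, not part of the patch): assuming the v1 router is
mounted so this route resolves to /api/v1/jobs/ and the app is running locally
via uvicorn (neither the mount point nor the route's HTTP method decorator is
visible in this diff), the new LinkedIn scraper is reachable through the same
endpoint as the other two:

    import requests

    # site_type must be a scraper_dict key ("indeed", "linkedin", or "zip");
    # any other value returns {"message": "site type not found"}
    resp = requests.get(
        "http://localhost:8000/api/v1/jobs/",
        params={
            "site_type": "linkedin",
            "search_term": "software engineer",
            "location": "Dallas, TX",
            "page": 1,  # LinkedIn pagination is 0-indexed internally (pageNum = page - 1)
        },
    )
    print(resp.json())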