From 2be3ebcb78a1c9c26fd75e34dafb2362092b54d7 Mon Sep 17 00:00:00 2001 From: Yariv Menachem Date: Mon, 6 Jan 2025 15:10:03 +0200 Subject: [PATCH] restructure project removed jobspy folder --- pyproject.toml | 8 +-- src/{jobspy => }/jobs/__init__.py | 0 src/jobspy/scrapers/__init__.py | 51 ------------------- src/main.py | 9 ++-- src/model/job_repository.py | 4 +- src/model/monogo_db.py | 2 +- src/model/user_repository.py | 2 +- src/{jobspy => scrapers}/__init__.py | 48 +++++++++-------- src/{jobspy => }/scrapers/exceptions.py | 2 +- .../scrapers/glassdoor/GlassDoorLocation.py | 0 .../scrapers/glassdoor/__init__.py | 9 ++-- .../scrapers/glassdoor/constants.py | 0 src/{jobspy => }/scrapers/google/__init__.py | 8 +-- src/{jobspy => }/scrapers/google/constants.py | 0 .../scrapers/goozali/GoozaliMapper.py | 2 +- src/{jobspy => }/scrapers/goozali/__init__.py | 7 +-- .../scrapers/goozali/model/GoozaliColumn.py | 0 .../goozali/model/GoozaliColumnChoice.py | 0 .../goozali/model/GoozaliColumnTypeOptions.py | 0 .../goozali/model/GoozaliFieldChoice.py | 0 .../goozali/model/GoozaliFullRequest.py | 0 .../goozali/model/GoozaliPartRequest.py | 0 .../scrapers/goozali/model/GoozaliRequest.py | 0 .../scrapers/goozali/model/GoozaliResponse.py | 0 .../goozali/model/GoozaliResponseData.py | 0 .../scrapers/goozali/model/GoozaliRow.py | 0 .../scrapers/goozali/model/__init__.py | 0 src/{jobspy => }/scrapers/indeed/__init__.py | 26 +++++----- src/{jobspy => }/scrapers/indeed/constants.py | 0 .../scrapers/linkedin/__init__.py | 10 ++-- .../scrapers/linkedin/constants.py | 0 src/scrapers/scraper.py | 17 +++++++ src/scrapers/scraper_input.py | 25 +++++++++ src/{jobspy => }/scrapers/site.py | 0 src/{jobspy => }/scrapers/utils.py | 6 +-- .../scrapers/ziprecruiter/__init__.py | 8 +-- .../scrapers/ziprecruiter/constants.py | 0 src/telegram_bot.py | 5 +- .../button_callback_context.py | 2 +- .../button_callback/button_fire_strategy.py | 2 +- .../button_job_title_strategy.py | 2 +- .../telegram_callback_handler.py | 2 +- .../telegram_default_handler.py | 4 +- .../telegram_myinfo_handler.py | 2 +- .../telegram_start_handler.py | 2 +- tests/test_all.py | 2 +- tests/test_glassdoor.py | 2 +- tests/test_google.py | 2 +- tests/test_goozali.py | 12 ++--- tests/test_indeed.py | 2 +- tests/test_linkedin.py | 2 +- tests/test_util.py | 2 +- tests/test_ziprecruiter.py | 2 +- 53 files changed, 144 insertions(+), 147 deletions(-) rename src/{jobspy => }/jobs/__init__.py (100%) delete mode 100644 src/jobspy/scrapers/__init__.py rename src/{jobspy => scrapers}/__init__.py (86%) rename src/{jobspy => }/scrapers/exceptions.py (96%) rename src/{jobspy => }/scrapers/glassdoor/GlassDoorLocation.py (100%) rename src/{jobspy => }/scrapers/glassdoor/__init__.py (99%) rename src/{jobspy => }/scrapers/glassdoor/constants.py (100%) rename src/{jobspy => }/scrapers/google/__init__.py (98%) rename src/{jobspy => }/scrapers/google/constants.py (100%) rename src/{jobspy => }/scrapers/goozali/GoozaliMapper.py (99%) rename src/{jobspy => }/scrapers/goozali/__init__.py (97%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliColumn.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliColumnChoice.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliColumnTypeOptions.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliFieldChoice.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliFullRequest.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliPartRequest.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliRequest.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliResponse.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliResponseData.py (100%) rename src/{jobspy => }/scrapers/goozali/model/GoozaliRow.py (100%) rename src/{jobspy => }/scrapers/goozali/model/__init__.py (100%) rename src/{jobspy => }/scrapers/indeed/__init__.py (94%) rename src/{jobspy => }/scrapers/indeed/constants.py (100%) rename src/{jobspy => }/scrapers/linkedin/__init__.py (98%) rename src/{jobspy => }/scrapers/linkedin/constants.py (100%) create mode 100644 src/scrapers/scraper.py create mode 100644 src/scrapers/scraper_input.py rename src/{jobspy => }/scrapers/site.py (100%) rename src/{jobspy => }/scrapers/utils.py (98%) rename src/{jobspy => }/scrapers/ziprecruiter/__init__.py (98%) rename src/{jobspy => }/scrapers/ziprecruiter/constants.py (100%) diff --git a/pyproject.toml b/pyproject.toml index c4275a7..fdb9bec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,15 +3,15 @@ requires = [ "poetry-core",] build-backend = "poetry.core.masonry.api" [tool.poetry] -name = "python-jobspy" +name = "python-JobSeekerTG" version = "1.1.76" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" -authors = [ "Zachary Hampton ", "Cullen Watson ",] -homepage = "https://github.com/Bunsly/JobSpy" +authors = [ "YM "] +homepage = "https://github.com/yariv245/JobSeekerTG" readme = "README.md" keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",] [[tool.poetry.packages]] -include = "jobspy" +include = "JobSeekerTG" from = "src" [tool.black] diff --git a/src/jobspy/jobs/__init__.py b/src/jobs/__init__.py similarity index 100% rename from src/jobspy/jobs/__init__.py rename to src/jobs/__init__.py diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py deleted file mode 100644 index c3f2756..0000000 --- a/src/jobspy/scrapers/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -from .site import Site -from ..jobs import ( - Enum, - BaseModel, - JobType, - JobResponse, - Country, - DescriptionFormat, -) - - -class SalarySource(Enum): - DIRECT_DATA = "direct_data" - DESCRIPTION = "description" - - -class ScraperInput(BaseModel): - site_type: list[Site] - search_term: str | None = None - google_search_term: str | None = None - - location: str | None = None - locations: list[str] | None = None - country: Country | None = Country.USA - distance: int | None = None - is_remote: bool = False - job_type: JobType | None = None - easy_apply: bool | None = None - offset: int = 0 - linkedin_fetch_description: bool = False - linkedin_company_ids: list[int] | None = None - description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN - - results_wanted: int = 15 - hours_old: int | None = None - - -class Scraper(ABC): - def __init__( - self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None - ): - self.site = site - self.proxies = proxies - self.ca_cert = ca_cert - - @abstractmethod - def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... diff --git a/src/main.py b/src/main.py index 4f5c782..99a87c1 100644 --- a/src/main.py +++ b/src/main.py @@ -1,10 +1,9 @@ -from telegram import Update, ReplyKeyboardMarkup, ReplyKeyboardRemove -from telegram.ext import Application, CommandHandler, ConversationHandler, \ - MessageHandler, filters, ContextTypes, CallbackQueryHandler +from telegram import Update +from telegram.ext import Application, CommandHandler, CallbackQueryHandler from config.settings import settings -from jobspy import Site -from jobspy.scrapers.utils import create_logger +from scrapers import Site +from scrapers.utils import create_logger from telegram_handler import TelegramDefaultHandler from telegram_handler.button_callback.telegram_callback_handler import TelegramCallHandler from telegram_handler.telegram_myinfo_handler import my_info_handler diff --git a/src/model/job_repository.py b/src/model/job_repository.py index 6a8adfa..25b2afe 100644 --- a/src/model/job_repository.py +++ b/src/model/job_repository.py @@ -3,8 +3,8 @@ from typing import Optional from dotenv import load_dotenv from pymongo import UpdateOne -from jobspy import create_logger -from jobspy.jobs import JobPost +from scrapers import create_logger +from jobs import JobPost from .monogo_db import mongo_client load_dotenv() diff --git a/src/model/monogo_db.py b/src/model/monogo_db.py index f15c4c8..c8ab81c 100644 --- a/src/model/monogo_db.py +++ b/src/model/monogo_db.py @@ -2,7 +2,7 @@ from pymongo import MongoClient from pymongo.synchronous.database import Database from config.settings import settings -from jobspy import create_logger +from scrapers.utils import create_logger class MongoDB: diff --git a/src/model/user_repository.py b/src/model/user_repository.py index 9df7edc..608bf32 100644 --- a/src/model/user_repository.py +++ b/src/model/user_repository.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from pymongo import UpdateOne from config.cache_manager import cache_manager -from jobspy import create_logger +from scrapers.utils import create_logger from .User import User from .monogo_db import mongo_client diff --git a/src/jobspy/__init__.py b/src/scrapers/__init__.py similarity index 86% rename from src/jobspy/__init__.py rename to src/scrapers/__init__.py index f176955..c38c8db 100644 --- a/src/jobspy/__init__.py +++ b/src/scrapers/__init__.py @@ -1,31 +1,29 @@ from __future__ import annotations import re -from threading import Lock +from asyncio import Lock, as_completed +from concurrent.futures import ThreadPoolExecutor -import pandas as pd -from typing import Tuple -from concurrent.futures import ThreadPoolExecutor, as_completed - -from .scrapers.site import Site - -from .scrapers.goozali import GoozaliScraper - -from .jobs import JobPost, JobType, Location -from .scrapers.utils import set_logger_level, extract_salary, create_logger -from .scrapers.indeed import IndeedScraper -from .scrapers.ziprecruiter import ZipRecruiterScraper -from .scrapers.glassdoor import GlassdoorScraper -from .scrapers.google import GoogleJobsScraper -from .scrapers.linkedin import LinkedInScraper -from .scrapers import SalarySource, ScraperInput, JobResponse, Country -from .scrapers.exceptions import ( - LinkedInException, - IndeedException, - ZipRecruiterException, - GlassdoorException, - GoogleJobsException, +from jobs import ( + Enum, + JobType, + JobResponse, + Country, + JobPost, ) +from .glassdoor import GlassdoorScraper +from .google import GoogleJobsScraper +from .goozali import GoozaliScraper +from .indeed import IndeedScraper +from .linkedin import LinkedInScraper +from .site import Site +from .utils import set_logger_level, create_logger +from .ziprecruiter import ZipRecruiterScraper + + +class SalarySource(Enum): + DIRECT_DATA = "direct_data" + DESCRIPTION = "description" def scrape_jobs( @@ -55,7 +53,7 @@ def scrape_jobs( ) -> (list[JobPost], list[JobPost]): """ Simultaneously scrapes job data from multiple job sites. - :return: pandas dataframe containing job data + :return: list of jobPost, list of new jobPost """ SCRAPER_MAPPING = { Site.LINKEDIN: LinkedInScraper, @@ -111,7 +109,7 @@ def scrape_jobs( hours_old=hours_old ) - def scrape_site(site: Site) -> Tuple[str, JobResponse]: + def scrape_site(site: Site) -> tuple[str, JobResponse]: scraper_class = SCRAPER_MAPPING[site] scraper = scraper_class(proxies=proxies, ca_cert=ca_cert) scraped_data: JobResponse = scraper.scrape(scraper_input) diff --git a/src/jobspy/scrapers/exceptions.py b/src/scrapers/exceptions.py similarity index 96% rename from src/jobspy/scrapers/exceptions.py rename to src/scrapers/exceptions.py index eba0479..dcfb3d2 100644 --- a/src/jobspy/scrapers/exceptions.py +++ b/src/scrapers/exceptions.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.exceptions +scrapers.exceptions ~~~~~~~~~~~~~~~~~~~ This module contains the set of Scrapers' exceptions. diff --git a/src/jobspy/scrapers/glassdoor/GlassDoorLocation.py b/src/scrapers/glassdoor/GlassDoorLocation.py similarity index 100% rename from src/jobspy/scrapers/glassdoor/GlassDoorLocation.py rename to src/scrapers/glassdoor/GlassDoorLocation.py diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/scrapers/glassdoor/__init__.py similarity index 99% rename from src/jobspy/scrapers/glassdoor/__init__.py rename to src/scrapers/glassdoor/__init__.py index 6266501..d0bf582 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/scrapers/glassdoor/__init__.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.glassdoor +scrapers.glassdoor ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape Glassdoor. @@ -7,7 +7,6 @@ This module contains routines to scrape Glassdoor. from __future__ import annotations -from dataclasses import dataclass import re import json import requests @@ -18,14 +17,16 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from .GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type from .constants import fallback_token, query_template, headers -from .. import Scraper, ScraperInput, Site +from ..scraper import Scraper +from ..scraper_input import ScraperInput +from ..site import Site from ..utils import extract_emails_from_text, create_logger from ..exceptions import GlassdoorException from ..utils import ( create_session, markdown_converter, ) -from ...jobs import ( +from jobs import ( JobPost, Compensation, CompensationInterval, diff --git a/src/jobspy/scrapers/glassdoor/constants.py b/src/scrapers/glassdoor/constants.py similarity index 100% rename from src/jobspy/scrapers/glassdoor/constants.py rename to src/scrapers/glassdoor/constants.py diff --git a/src/jobspy/scrapers/google/__init__.py b/src/scrapers/google/__init__.py similarity index 98% rename from src/jobspy/scrapers/google/__init__.py rename to src/scrapers/google/__init__.py index 523e6f5..05cab69 100644 --- a/src/jobspy/scrapers/google/__init__.py +++ b/src/scrapers/google/__init__.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.google +scrapers.google ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape Google. @@ -14,12 +14,14 @@ from typing import Tuple from datetime import datetime, timedelta from .constants import headers_jobs, headers_initial, async_param -from .. import Scraper, ScraperInput, Site +from ..scraper import Scraper +from ..scraper_input import ScraperInput +from ..site import Site from ..utils import extract_emails_from_text, create_logger, extract_job_type from ..utils import ( create_session, ) -from ...jobs import ( +from jobs import ( JobPost, JobResponse, Location, diff --git a/src/jobspy/scrapers/google/constants.py b/src/scrapers/google/constants.py similarity index 100% rename from src/jobspy/scrapers/google/constants.py rename to src/scrapers/google/constants.py diff --git a/src/jobspy/scrapers/goozali/GoozaliMapper.py b/src/scrapers/goozali/GoozaliMapper.py similarity index 99% rename from src/jobspy/scrapers/goozali/GoozaliMapper.py rename to src/scrapers/goozali/GoozaliMapper.py index a68e6ba..0b37e43 100644 --- a/src/jobspy/scrapers/goozali/GoozaliMapper.py +++ b/src/scrapers/goozali/GoozaliMapper.py @@ -1,7 +1,7 @@ from datetime import datetime import json -from jobspy.jobs import JobPost, Location +from jobs import JobPost, Location from .model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliResponseData from .constants import job_post_column_to_goozali_column, job_post_column_names diff --git a/src/jobspy/scrapers/goozali/__init__.py b/src/scrapers/goozali/__init__.py similarity index 97% rename from src/jobspy/scrapers/goozali/__init__.py rename to src/scrapers/goozali/__init__.py index 59e334e..2a3f1b2 100644 --- a/src/jobspy/scrapers/goozali/__init__.py +++ b/src/scrapers/goozali/__init__.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.Goozali +scrapers.Goozali ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape Goozali. @@ -9,15 +9,16 @@ from __future__ import annotations from model.User import User from model.user_repository import user_repository -from .. import Scraper, ScraperInput from .GoozaliMapper import GoozaliMapper from .GoozaliScrapperComponent import GoozaliScrapperComponent from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, GoozaliFullRequest +from ..scraper import Scraper +from ..scraper_input import ScraperInput from ..site import Site from ..utils import create_dict_by_key_and_value, create_session, create_logger -from ...jobs import ( +from jobs import ( JobPost, JobResponse, ) diff --git a/src/jobspy/scrapers/goozali/model/GoozaliColumn.py b/src/scrapers/goozali/model/GoozaliColumn.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliColumn.py rename to src/scrapers/goozali/model/GoozaliColumn.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliColumnChoice.py b/src/scrapers/goozali/model/GoozaliColumnChoice.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliColumnChoice.py rename to src/scrapers/goozali/model/GoozaliColumnChoice.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliColumnTypeOptions.py b/src/scrapers/goozali/model/GoozaliColumnTypeOptions.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliColumnTypeOptions.py rename to src/scrapers/goozali/model/GoozaliColumnTypeOptions.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliFieldChoice.py b/src/scrapers/goozali/model/GoozaliFieldChoice.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliFieldChoice.py rename to src/scrapers/goozali/model/GoozaliFieldChoice.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliFullRequest.py b/src/scrapers/goozali/model/GoozaliFullRequest.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliFullRequest.py rename to src/scrapers/goozali/model/GoozaliFullRequest.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliPartRequest.py b/src/scrapers/goozali/model/GoozaliPartRequest.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliPartRequest.py rename to src/scrapers/goozali/model/GoozaliPartRequest.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliRequest.py b/src/scrapers/goozali/model/GoozaliRequest.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliRequest.py rename to src/scrapers/goozali/model/GoozaliRequest.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliResponse.py b/src/scrapers/goozali/model/GoozaliResponse.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliResponse.py rename to src/scrapers/goozali/model/GoozaliResponse.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliResponseData.py b/src/scrapers/goozali/model/GoozaliResponseData.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliResponseData.py rename to src/scrapers/goozali/model/GoozaliResponseData.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliRow.py b/src/scrapers/goozali/model/GoozaliRow.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/GoozaliRow.py rename to src/scrapers/goozali/model/GoozaliRow.py diff --git a/src/jobspy/scrapers/goozali/model/__init__.py b/src/scrapers/goozali/model/__init__.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/__init__.py rename to src/scrapers/goozali/model/__init__.py diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/scrapers/indeed/__init__.py similarity index 94% rename from src/jobspy/scrapers/indeed/__init__.py rename to src/scrapers/indeed/__init__.py index 05ae16c..80ef94b 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/scrapers/indeed/__init__.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.indeed +scrapers.indeed ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape Indeed. @@ -12,7 +12,9 @@ from typing import Tuple from datetime import datetime from .constants import job_search_query, api_headers -from .. import Scraper, ScraperInput, Site +from ..scraper import Scraper +from ..scraper_input import ScraperInput +from ..site import Site from ..utils import ( extract_emails_from_text, get_enum_from_job_type, @@ -20,7 +22,7 @@ from ..utils import ( create_session, create_logger, ) -from ...jobs import ( +from jobs import ( JobPost, Compensation, CompensationInterval, @@ -35,7 +37,7 @@ logger = create_logger("Indeed") class IndeedScraper(Scraper): def __init__( - self, proxies: list[str] | str | None = None, ca_cert: str | None = None + self, proxies: list[str] | str | None = None, ca_cert: str | None = None ): """ Initializes IndeedScraper with the Indeed API url @@ -74,7 +76,7 @@ class IndeedScraper(Scraper): while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset: logger.info( f"search page: { - page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}" + page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}" ) jobs, cursor = self._scrape_page(cursor, location) if not jobs: @@ -85,9 +87,9 @@ class IndeedScraper(Scraper): return JobResponse( jobs=job_list[ - scraper_input.offset: scraper_input.offset - + scraper_input.results_wanted - ] + scraper_input.offset: scraper_input.offset + + scraper_input.results_wanted + ] ) def _scrape_page(self, cursor: str | None, location: str) -> Tuple[list[JobPost], str | None]: @@ -108,7 +110,7 @@ class IndeedScraper(Scraper): what=(f'what: "{search_term}"' if search_term else ""), location=( f'location: {{where: "{location}", radius: { - self.scraper_input.distance}, radiusUnit: MILES}}' + self.scraper_input.distance}, radiusUnit: MILES}}' if location else "" ), @@ -130,7 +132,7 @@ class IndeedScraper(Scraper): if not response.ok: logger.info( f"responded with status code: { - response.status_code} (submit GitHub issue if this appears to be a bug)" + response.status_code} (submit GitHub issue if this appears to be a bug)" ) return jobs, new_cursor data = response.json() @@ -232,7 +234,7 @@ class IndeedScraper(Scraper): company_name=job["employer"].get( "name") if job.get("employer") else None, company_url=(f"{self.base_url}{ - rel_url}" if job["employer"] else None), + rel_url}" if job["employer"] else None), company_url_direct=( employer["links"]["corporateWebsite"] if employer else None ), @@ -345,7 +347,7 @@ class IndeedScraper(Scraper): for keyword in remote_keywords ) return ( - is_remote_in_attributes or is_remote_in_description or is_remote_in_location + is_remote_in_attributes or is_remote_in_description or is_remote_in_location ) @staticmethod diff --git a/src/jobspy/scrapers/indeed/constants.py b/src/scrapers/indeed/constants.py similarity index 100% rename from src/jobspy/scrapers/indeed/constants.py rename to src/scrapers/indeed/constants.py diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/scrapers/linkedin/__init__.py similarity index 98% rename from src/jobspy/scrapers/linkedin/__init__.py rename to src/scrapers/linkedin/__init__.py index 4519610..8e04d3f 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/scrapers/linkedin/__init__.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.linkedin +scrapers.linkedin ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape LinkedIn. @@ -17,13 +17,15 @@ from datetime import datetime from bs4.element import Tag from bs4 import BeautifulSoup from urllib.parse import urlparse, urlunparse, unquote -from requests.exceptions import RetryError, RequestException +from requests.exceptions import RetryError from urllib3.exceptions import MaxRetryError from .constants import headers -from .. import Scraper, ScraperInput, Site +from ..scraper import Scraper +from ..scraper_input import ScraperInput +from ..site import Site from ..exceptions import LinkedInException from ..utils import create_session, remove_attributes, create_logger -from ...jobs import ( +from jobs import ( JobPost, Location, JobResponse, diff --git a/src/jobspy/scrapers/linkedin/constants.py b/src/scrapers/linkedin/constants.py similarity index 100% rename from src/jobspy/scrapers/linkedin/constants.py rename to src/scrapers/linkedin/constants.py diff --git a/src/scrapers/scraper.py b/src/scrapers/scraper.py new file mode 100644 index 0000000..c5ba529 --- /dev/null +++ b/src/scrapers/scraper.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod + +from jobs import JobResponse +from scrapers.site import Site +from scrapers.scraper_input import ScraperInput + + +class Scraper(ABC): + def __init__( + self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None + ): + self.site = site + self.proxies = proxies + self.ca_cert = ca_cert + + @abstractmethod + def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... \ No newline at end of file diff --git a/src/scrapers/scraper_input.py b/src/scrapers/scraper_input.py new file mode 100644 index 0000000..9b3a183 --- /dev/null +++ b/src/scrapers/scraper_input.py @@ -0,0 +1,25 @@ +from pydantic import BaseModel + +from jobs import Country, JobType, DescriptionFormat +from scrapers.site import Site + + +class ScraperInput(BaseModel): + site_type: list[Site] + search_term: str | None = None + google_search_term: str | None = None + + location: str | None = None + locations: list[str] | None = None + country: Country | None = Country.USA + distance: int | None = None + is_remote: bool = False + job_type: JobType | None = None + easy_apply: bool | None = None + offset: int = 0 + linkedin_fetch_description: bool = False + linkedin_company_ids: list[int] | None = None + description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN + + results_wanted: int = 15 + hours_old: int | None = None \ No newline at end of file diff --git a/src/jobspy/scrapers/site.py b/src/scrapers/site.py similarity index 100% rename from src/jobspy/scrapers/site.py rename to src/scrapers/site.py diff --git a/src/jobspy/scrapers/utils.py b/src/scrapers/utils.py similarity index 98% rename from src/jobspy/scrapers/utils.py rename to src/scrapers/utils.py index 6947650..ac77352 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/scrapers/utils.py @@ -11,11 +11,11 @@ import numpy as np from markdownify import markdownify as md from requests.adapters import HTTPAdapter, Retry -from ..jobs import CompensationInterval, JobType +from jobs import CompensationInterval, JobType def create_logger(name: str): - logger = logging.getLogger(f"JobSpy:{name}") + logger = logging.getLogger(f"JobSeekerTG:{name}") logger.propagate = False if not logger.handlers: logger.setLevel(logging.INFO) @@ -143,7 +143,7 @@ def set_logger_level(verbose: int = 2): level = getattr(logging, level_name.upper(), None) if level is not None: for logger_name in logging.root.manager.loggerDict: - if logger_name.startswith("JobSpy:"): + if logger_name.startswith("JobSeekerTG:"): logging.getLogger(logger_name).setLevel(level) else: raise ValueError(f"Invalid log level: {level_name}") diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/scrapers/ziprecruiter/__init__.py similarity index 98% rename from src/jobspy/scrapers/ziprecruiter/__init__.py rename to src/scrapers/ziprecruiter/__init__.py index 294ca8c..90dab76 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/scrapers/ziprecruiter/__init__.py @@ -1,5 +1,5 @@ """ -jobspy.scrapers.ziprecruiter +scrapers.ziprecruiter ~~~~~~~~~~~~~~~~~~~ This module contains routines to scrape ZipRecruiter. @@ -19,7 +19,9 @@ from concurrent.futures import ThreadPoolExecutor from bs4 import BeautifulSoup from .constants import headers -from .. import Scraper, ScraperInput, Site +from ..site import Site +from ..scraper import Scraper +from ..scraper_input import ScraperInput from ..utils import ( extract_emails_from_text, create_session, @@ -27,7 +29,7 @@ from ..utils import ( remove_attributes, create_logger, ) -from ...jobs import ( +from jobs import ( JobPost, Compensation, Location, diff --git a/src/jobspy/scrapers/ziprecruiter/constants.py b/src/scrapers/ziprecruiter/constants.py similarity index 100% rename from src/jobspy/scrapers/ziprecruiter/constants.py rename to src/scrapers/ziprecruiter/constants.py diff --git a/src/telegram_bot.py b/src/telegram_bot.py index 1bfaff9..6f7c1d7 100644 --- a/src/telegram_bot.py +++ b/src/telegram_bot.py @@ -1,4 +1,3 @@ -import os from typing import Union from dotenv import load_dotenv @@ -6,8 +5,8 @@ from telegram import Bot, InlineKeyboardButton, InlineKeyboardMarkup from telegram.constants import ReactionEmoji from config.settings import settings -from jobspy.jobs import JobPost -from jobspy.scrapers.utils import create_logger +from jobs import JobPost +from scrapers.utils import create_logger load_dotenv() diff --git a/src/telegram_handler/button_callback/button_callback_context.py b/src/telegram_handler/button_callback/button_callback_context.py index 5d06367..52c5fe9 100644 --- a/src/telegram_handler/button_callback/button_callback_context.py +++ b/src/telegram_handler/button_callback/button_callback_context.py @@ -3,7 +3,7 @@ from __future__ import annotations from telegram import MaybeInaccessibleMessage from telegram.constants import ReactionEmoji -from jobspy import create_logger +from scrapers import create_logger from model.job_repository import job_repository from telegram_handler.button_callback.button_fire_strategy import FireStrategy from telegram_handler.button_callback.button_job_title_strategy import JobTitleStrategy diff --git a/src/telegram_handler/button_callback/button_fire_strategy.py b/src/telegram_handler/button_callback/button_fire_strategy.py index 90f6050..44af48b 100644 --- a/src/telegram_handler/button_callback/button_fire_strategy.py +++ b/src/telegram_handler/button_callback/button_fire_strategy.py @@ -1,7 +1,7 @@ from telegram import MaybeInaccessibleMessage from telegram.constants import ReactionEmoji -from jobspy import create_logger +from scrapers import create_logger from model.job_repository import job_repository from telegram_bot import TelegramBot from telegram_handler.button_callback.button_strategy import ButtonStrategy diff --git a/src/telegram_handler/button_callback/button_job_title_strategy.py b/src/telegram_handler/button_callback/button_job_title_strategy.py index a96bbf7..bec2535 100644 --- a/src/telegram_handler/button_callback/button_job_title_strategy.py +++ b/src/telegram_handler/button_callback/button_job_title_strategy.py @@ -1,6 +1,6 @@ from typing import Union -from jobspy import JobPost +from scrapers import JobPost from telegram_bot import TelegramBot from telegram_handler.button_callback.button_strategy import ButtonStrategy diff --git a/src/telegram_handler/button_callback/telegram_callback_handler.py b/src/telegram_handler/button_callback/telegram_callback_handler.py index b43f4cc..051bd8b 100644 --- a/src/telegram_handler/button_callback/telegram_callback_handler.py +++ b/src/telegram_handler/button_callback/telegram_callback_handler.py @@ -3,7 +3,7 @@ from telegram.ext import ( ContextTypes, ) -from jobspy import create_logger +from scrapers import create_logger from telegram_bot import TelegramBot from telegram_handler.button_callback.button_callback_context import ButtonCallBackContext diff --git a/src/telegram_handler/telegram_default_handler.py b/src/telegram_handler/telegram_default_handler.py index 6bb80cc..0266d4f 100644 --- a/src/telegram_handler/telegram_default_handler.py +++ b/src/telegram_handler/telegram_default_handler.py @@ -4,8 +4,8 @@ from telegram.ext import ( ContextTypes, ) -from jobspy import Site, scrape_jobs, JobPost -from jobspy.scrapers.utils import create_logger +from scrapers import Site, scrape_jobs, JobPost +from scrapers.utils import create_logger from model.job_repository import JobRepository from model.user_repository import user_repository from telegram_bot import TelegramBot diff --git a/src/telegram_handler/telegram_myinfo_handler.py b/src/telegram_handler/telegram_myinfo_handler.py index 853a707..4097833 100644 --- a/src/telegram_handler/telegram_myinfo_handler.py +++ b/src/telegram_handler/telegram_myinfo_handler.py @@ -4,7 +4,7 @@ from telegram.ext import ( ContextTypes, ) -from jobspy.scrapers.utils import create_logger +from scrapers.utils import create_logger from model.user_repository import user_repository from telegram_bot import TelegramBot from telegram_handler.telegram_handler import TelegramHandler diff --git a/src/telegram_handler/telegram_start_handler.py b/src/telegram_handler/telegram_start_handler.py index 68058c9..d30c643 100644 --- a/src/telegram_handler/telegram_start_handler.py +++ b/src/telegram_handler/telegram_start_handler.py @@ -7,7 +7,7 @@ from telegram.ext import ( ) from config.cache_manager import cache_manager -from jobspy.scrapers.utils import create_logger +from scrapers.utils import create_logger from model.Position import Position from model.User import User from model.user_repository import user_repository diff --git a/tests/test_all.py b/tests/test_all.py index 3285611..6a6ff60 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1,4 +1,4 @@ -from jobspy import scrape_jobs +from scrapers import scrape_jobs import pandas as pd diff --git a/tests/test_glassdoor.py b/tests/test_glassdoor.py index 267a3e6..16676ba 100644 --- a/tests/test_glassdoor.py +++ b/tests/test_glassdoor.py @@ -1,4 +1,4 @@ -from jobspy import scrape_jobs +from scrapers import scrape_jobs import pandas as pd diff --git a/tests/test_google.py b/tests/test_google.py index 9f30ffe..5fa10f3 100644 --- a/tests/test_google.py +++ b/tests/test_google.py @@ -1,4 +1,4 @@ -from jobspy import scrape_jobs +from scrapers import scrape_jobs import pandas as pd diff --git a/tests/test_goozali.py b/tests/test_goozali.py index 2f59956..9f68ab5 100644 --- a/tests/test_goozali.py +++ b/tests/test_goozali.py @@ -1,12 +1,12 @@ import json import os -from jobspy.jobs import JobPost -from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper -from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent -from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column -from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData -from jobspy.scrapers.utils import create_dict_by_key_and_value +from jobs import JobPost +from scrapers.goozali.GoozaliMapper import GoozaliMapper +from scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent +from scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column +from scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData +from scrapers.utils import create_dict_by_key_and_value # URL Example # https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D diff --git a/tests/test_indeed.py b/tests/test_indeed.py index 714fc53..0468afb 100644 --- a/tests/test_indeed.py +++ b/tests/test_indeed.py @@ -1,4 +1,4 @@ -from jobspy import scrape_jobs +from scrapers import scrape_jobs import pandas as pd diff --git a/tests/test_linkedin.py b/tests/test_linkedin.py index 0cb5ec4..29d0bf8 100644 --- a/tests/test_linkedin.py +++ b/tests/test_linkedin.py @@ -1,4 +1,4 @@ -from jobspy import scrape_jobs +from scrapers import scrape_jobs import pandas as pd diff --git a/tests/test_util.py b/tests/test_util.py index 5ad8751..bfc3f8c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,7 +1,7 @@ from datetime import datetime, date from typing import List -from jobspy import JobPost, Location, Country +from scrapers import JobPost, Location, Country # Creating some test job posts diff --git a/tests/test_ziprecruiter.py b/tests/test_ziprecruiter.py index 61de491..a023590 100644 --- a/tests/test_ziprecruiter.py +++ b/tests/test_ziprecruiter.py @@ -1,4 +1,4 @@ -from jobspy import scrape_jobs +from scrapers import scrape_jobs import pandas as pd