restructure project

removed jobspy folder
pull/231/head
Yariv Menachem 2025-01-06 15:10:03 +02:00
parent fced92f871
commit 2be3ebcb78
53 changed files with 144 additions and 147 deletions

View File

@@ -3,15 +3,15 @@ requires = [ "poetry-core",]
build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
name = "python-JobSeekerTG"
version = "1.1.76"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",]
homepage = "https://github.com/Bunsly/JobSpy"
authors = [ "YM "]
homepage = "https://github.com/yariv245/JobSeekerTG"
readme = "README.md"
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
[[tool.poetry.packages]]
include = "jobspy"
include = "JobSeekerTG"
from = "src"
[tool.black]

View File

@@ -1,51 +0,0 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from .site import Site
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat,
)
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
google_search_term: str | None = None
location: str | None = None
locations: list[str] | None = None
country: Country | None = Country.USA
distance: int | None = None
is_remote: bool = False
job_type: JobType | None = None
easy_apply: bool | None = None
offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
results_wanted: int = 15
hours_old: int | None = None
class Scraper(ABC):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@@ -1,10 +1,9 @@
from telegram import Update, ReplyKeyboardMarkup, ReplyKeyboardRemove
from telegram.ext import Application, CommandHandler, ConversationHandler, \
MessageHandler, filters, ContextTypes, CallbackQueryHandler
from telegram import Update
from telegram.ext import Application, CommandHandler, CallbackQueryHandler
from config.settings import settings
from jobspy import Site
from jobspy.scrapers.utils import create_logger
from scrapers import Site
from scrapers.utils import create_logger
from telegram_handler import TelegramDefaultHandler
from telegram_handler.button_callback.telegram_callback_handler import TelegramCallHandler
from telegram_handler.telegram_myinfo_handler import my_info_handler

View File

@@ -3,8 +3,8 @@ from typing import Optional
from dotenv import load_dotenv
from pymongo import UpdateOne
from jobspy import create_logger
from jobspy.jobs import JobPost
from scrapers import create_logger
from jobs import JobPost
from .monogo_db import mongo_client
load_dotenv()

View File

@@ -2,7 +2,7 @@ from pymongo import MongoClient
from pymongo.synchronous.database import Database
from config.settings import settings
from jobspy import create_logger
from scrapers.utils import create_logger
class MongoDB:

View File

@@ -4,7 +4,7 @@ from dotenv import load_dotenv
from pymongo import UpdateOne
from config.cache_manager import cache_manager
from jobspy import create_logger
from scrapers.utils import create_logger
from .User import User
from .monogo_db import mongo_client

View File

@@ -1,31 +1,29 @@
from __future__ import annotations
import re
from threading import Lock
from asyncio import Lock, as_completed
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from .scrapers.site import Site
from .scrapers.goozali import GoozaliScraper
from .jobs import JobPost, JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
GlassdoorException,
GoogleJobsException,
from jobs import (
Enum,
JobType,
JobResponse,
Country,
JobPost,
)
from .glassdoor import GlassdoorScraper
from .google import GoogleJobsScraper
from .goozali import GoozaliScraper
from .indeed import IndeedScraper
from .linkedin import LinkedInScraper
from .site import Site
from .utils import set_logger_level, create_logger
from .ziprecruiter import ZipRecruiterScraper
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
def scrape_jobs(
@@ -55,7 +53,7 @@ def scrape_jobs(
) -> (list[JobPost], list[JobPost]):
"""
Simultaneously scrapes job data from multiple job sites.
:return: pandas dataframe containing job data
:return: list of jobPost, list of new jobPost
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
@@ -111,7 +109,7 @@ def scrape_jobs(
hours_old=hours_old
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
def scrape_site(site: Site) -> tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
scraped_data: JobResponse = scraper.scrape(scraper_input)

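For orientation, a minimal usage sketch of the restructured entry point (not part of the commit); the keyword names are assumed to mirror the ScraperInput fields shown later in this diff, and the two return lists come from the updated docstring above:

# Sketch only: exercising the relocated scrape_jobs under the new flat layout.
from scrapers import scrape_jobs, Site

jobs, new_jobs = scrape_jobs(
    site_type=[Site.LINKEDIN],      # keyword assumed from ScraperInput
    search_term="python developer",
    results_wanted=15,
)
print(f"{len(jobs)} scraped, {len(new_jobs)} new")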
View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.exceptions
scrapers.exceptions
~~~~~~~~~~~~~~~~~~~
This module contains the set of Scrapers' exceptions.

View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.glassdoor
scrapers.glassdoor
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Glassdoor.
@@ -7,7 +7,6 @@ This module contains routines to scrape Glassdoor.
from __future__ import annotations
from dataclasses import dataclass
import re
import json
import requests
@@ -18,14 +17,16 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from .GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import extract_emails_from_text, create_logger
from ..exceptions import GlassdoorException
from ..utils import (
create_session,
markdown_converter,
)
from ...jobs import (
from jobs import (
JobPost,
Compensation,
CompensationInterval,

View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.google
scrapers.google
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Google.
@@ -14,12 +14,14 @@ from typing import Tuple
from datetime import datetime, timedelta
from .constants import headers_jobs, headers_initial, async_param
from .. import Scraper, ScraperInput, Site
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import extract_emails_from_text, create_logger, extract_job_type
from ..utils import (
create_session,
)
from ...jobs import (
from jobs import (
JobPost,
JobResponse,
Location,

View File

@@ -1,7 +1,7 @@
from datetime import datetime
import json
from jobspy.jobs import JobPost, Location
from jobs import JobPost, Location
from .model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliResponseData
from .constants import job_post_column_to_goozali_column, job_post_column_names

View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.Goozali
scrapers.Goozali
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Goozali.
@@ -9,15 +9,16 @@ from __future__ import annotations
from model.User import User
from model.user_repository import user_repository
from .. import Scraper, ScraperInput
from .GoozaliMapper import GoozaliMapper
from .GoozaliScrapperComponent import GoozaliScrapperComponent
from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, GoozaliFullRequest
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import create_dict_by_key_and_value, create_session, create_logger
from ...jobs import (
from jobs import (
JobPost,
JobResponse,
)

View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.indeed
scrapers.indeed
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Indeed.
@@ -12,7 +12,9 @@ from typing import Tuple
from datetime import datetime
from .constants import job_search_query, api_headers
from .. import Scraper, ScraperInput, Site
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
@@ -20,7 +22,7 @@ from ..utils import (
create_session,
create_logger,
)
from ...jobs import (
from jobs import (
JobPost,
Compensation,
CompensationInterval,
@@ -35,7 +37,7 @@ logger = create_logger("Indeed")
class IndeedScraper(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
Initializes IndeedScraper with the Indeed API url
@@ -74,7 +76,7 @@ class IndeedScraper(Scraper):
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
logger.info(
f"search page: {
page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, cursor = self._scrape_page(cursor, location)
if not jobs:
@@ -85,9 +87,9 @@ class IndeedScraper(Scraper):
return JobResponse(
jobs=job_list[
scraper_input.offset: scraper_input.offset
+ scraper_input.results_wanted
]
scraper_input.offset: scraper_input.offset
+ scraper_input.results_wanted
]
)
def _scrape_page(self, cursor: str | None, location: str) -> Tuple[list[JobPost], str | None]:
@@ -108,7 +110,7 @@ class IndeedScraper(Scraper):
what=(f'what: "{search_term}"' if search_term else ""),
location=(
f'location: {{where: "{location}", radius: {
self.scraper_input.distance}, radiusUnit: MILES}}'
self.scraper_input.distance}, radiusUnit: MILES}}'
if location
else ""
),
@@ -130,7 +132,7 @@ class IndeedScraper(Scraper):
if not response.ok:
logger.info(
f"responded with status code: {
response.status_code} (submit GitHub issue if this appears to be a bug)"
response.status_code} (submit GitHub issue if this appears to be a bug)"
)
return jobs, new_cursor
data = response.json()
@@ -232,7 +234,7 @@ class IndeedScraper(Scraper):
company_name=job["employer"].get(
"name") if job.get("employer") else None,
company_url=(f"{self.base_url}{
rel_url}" if job["employer"] else None),
rel_url}" if job["employer"] else None),
company_url_direct=(
employer["links"]["corporateWebsite"] if employer else None
),
@@ -345,7 +347,7 @@ class IndeedScraper(Scraper):
for keyword in remote_keywords
)
return (
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
)
@staticmethod

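The offset slice in the JobResponse above is easy to misread; a throwaway sketch with made-up numbers shows exactly which posts it keeps:

# Illustrative only: the slice pattern used in IndeedScraper.scrape.
job_list = [f"job{i}" for i in range(40)]   # pretend 40 scraped posts
offset, results_wanted = 10, 15
kept = job_list[offset : offset + results_wanted]
assert kept == [f"job{i}" for i in range(10, 25)]   # posts 10..24 inclusive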
View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.linkedin
scrapers.linkedin
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape LinkedIn.
@@ -17,13 +17,15 @@ from datetime import datetime
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from requests.exceptions import RetryError
from urllib3.exceptions import MaxRetryError
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..exceptions import LinkedInException
from ..utils import create_session, remove_attributes, create_logger
from ...jobs import (
from jobs import (
JobPost,
Location,
JobResponse,

src/scrapers/scraper.py (new file, 17 additions)
View File

@@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
from jobs import JobResponse
from scrapers.site import Site
from scrapers.scraper_input import ScraperInput
class Scraper(ABC):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

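The relocated ABC is small enough to sketch a conforming subclass; ExampleScraper below is hypothetical and only illustrates the contract:

# Hypothetical subclass, not part of the commit.
from jobs import JobResponse
from scrapers.scraper import Scraper
from scrapers.scraper_input import ScraperInput
from scrapers.site import Site

class ExampleScraper(Scraper):
    def __init__(self, proxies: list[str] | None = None, ca_cert: str | None = None):
        # Each concrete scraper pins its own Site member.
        super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        # A real implementation would fetch and parse listings here.
        return JobResponse(jobs=[])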
View File

@@ -0,0 +1,25 @@
from pydantic import BaseModel
from jobs import Country, JobType, DescriptionFormat
from scrapers.site import Site
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
google_search_term: str | None = None
location: str | None = None
locations: list[str] | None = None
country: Country | None = Country.USA
distance: int | None = None
is_remote: bool = False
job_type: JobType | None = None
easy_apply: bool | None = None
offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
results_wanted: int = 15
hours_old: int | None = None

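Since every field except site_type is optional or defaulted, constructing the relocated model is a single call; values below are illustrative:

# Illustrative values; field names come from the model above.
from scrapers.scraper_input import ScraperInput
from scrapers.site import Site

scraper_input = ScraperInput(
    site_type=[Site.LINKEDIN],
    search_term="backend engineer",
    is_remote=True,
    results_wanted=20,      # overrides the default of 15
)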
View File

@@ -11,11 +11,11 @@ import numpy as np
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
from ..jobs import CompensationInterval, JobType
from jobs import CompensationInterval, JobType
def create_logger(name: str):
logger = logging.getLogger(f"JobSpy:{name}")
logger = logging.getLogger(f"JobSeekerTG:{name}")
logger.propagate = False
if not logger.handlers:
logger.setLevel(logging.INFO)
@@ -143,7 +143,7 @@ def set_logger_level(verbose: int = 2):
level = getattr(logging, level_name.upper(), None)
if level is not None:
for logger_name in logging.root.manager.loggerDict:
if logger_name.startswith("JobSpy:"):
if logger_name.startswith("JobSeekerTG:"):
logging.getLogger(logger_name).setLevel(level)
else:
raise ValueError(f"Invalid log level: {level_name}")

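The prefix rename is load-bearing for set_logger_level, which matches loggers by name; a short sketch (both functions are the ones in this diff, the verbose-to-level mapping is internal):

from scrapers.utils import create_logger, set_logger_level

logger = create_logger("Indeed")   # registers logging.getLogger("JobSeekerTG:Indeed")
set_logger_level(2)                # re-levels every logger named "JobSeekerTG:*"
logger.info("scraper ready")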
View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.ziprecruiter
scrapers.ziprecruiter
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape ZipRecruiter.
@@ -19,7 +19,9 @@ from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..site import Site
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..utils import (
extract_emails_from_text,
create_session,
@@ -27,7 +29,7 @@ from ..utils import (
remove_attributes,
create_logger,
)
from ...jobs import (
from jobs import (
JobPost,
Compensation,
Location,

View File

@@ -1,4 +1,3 @@
import os
from typing import Union
from dotenv import load_dotenv
@@ -6,8 +5,8 @@ from telegram import Bot, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.constants import ReactionEmoji
from config.settings import settings
from jobspy.jobs import JobPost
from jobspy.scrapers.utils import create_logger
from jobs import JobPost
from scrapers.utils import create_logger
load_dotenv()

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
from telegram import MaybeInaccessibleMessage
from telegram.constants import ReactionEmoji
from jobspy import create_logger
from scrapers import create_logger
from model.job_repository import job_repository
from telegram_handler.button_callback.button_fire_strategy import FireStrategy
from telegram_handler.button_callback.button_job_title_strategy import JobTitleStrategy

View File

@@ -1,7 +1,7 @@
from telegram import MaybeInaccessibleMessage
from telegram.constants import ReactionEmoji
from jobspy import create_logger
from scrapers import create_logger
from model.job_repository import job_repository
from telegram_bot import TelegramBot
from telegram_handler.button_callback.button_strategy import ButtonStrategy

View File

@@ -1,6 +1,6 @@
from typing import Union
from jobspy import JobPost
from scrapers import JobPost
from telegram_bot import TelegramBot
from telegram_handler.button_callback.button_strategy import ButtonStrategy

View File

@@ -3,7 +3,7 @@ from telegram.ext import (
ContextTypes,
)
from jobspy import create_logger
from scrapers import create_logger
from telegram_bot import TelegramBot
from telegram_handler.button_callback.button_callback_context import ButtonCallBackContext

View File

@@ -4,8 +4,8 @@ from telegram.ext import (
ContextTypes,
)
from jobspy import Site, scrape_jobs, JobPost
from jobspy.scrapers.utils import create_logger
from scrapers import Site, scrape_jobs, JobPost
from scrapers.utils import create_logger
from model.job_repository import JobRepository
from model.user_repository import user_repository
from telegram_bot import TelegramBot

View File

@@ -4,7 +4,7 @@ from telegram.ext import (
ContextTypes,
)
from jobspy.scrapers.utils import create_logger
from scrapers.utils import create_logger
from model.user_repository import user_repository
from telegram_bot import TelegramBot
from telegram_handler.telegram_handler import TelegramHandler

View File

@@ -7,7 +7,7 @@ from telegram.ext import (
)
from config.cache_manager import cache_manager
from jobspy.scrapers.utils import create_logger
from scrapers.utils import create_logger
from model.Position import Position
from model.User import User
from model.user_repository import user_repository

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs
from scrapers import scrape_jobs
import pandas as pd

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs
from scrapers import scrape_jobs
import pandas as pd

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs
from scrapers import scrape_jobs
import pandas as pd

View File

@@ -1,12 +1,12 @@
import json
import os
from jobspy.jobs import JobPost
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column
from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData
from jobspy.scrapers.utils import create_dict_by_key_and_value
from jobs import JobPost
from scrapers.goozali.GoozaliMapper import GoozaliMapper
from scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
from scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column
from scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData
from scrapers.utils import create_dict_by_key_and_value
# URL Example
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs
from scrapers import scrape_jobs
import pandas as pd

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs
from scrapers import scrape_jobs
import pandas as pd

View File

@@ -1,7 +1,7 @@
from datetime import datetime, date
from typing import List
from jobspy import JobPost, Location, Country
from scrapers import JobPost, Location, Country
# Creating some test job posts

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs
from scrapers import scrape_jobs
import pandas as pd
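Taken together, the test and example files settle on one import pattern; a before/after cheat-sheet (a sketch, both halves taken from this diff):

# old: from jobspy import scrape_jobs, Site, JobPost
# old: from jobspy.jobs import JobPost, Location
# old: from jobspy.scrapers.utils import create_logger
from scrapers import scrape_jobs, Site
from scrapers.utils import create_logger
from jobs import JobPost, Location, Country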