Merge pull request #3 from yariv245/start_conv_handler

start_conv_handler
pull/231/head
Yariv Menachem 2025-01-06 16:25:26 +02:00 committed by GitHub
commit 69a420710e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
65 changed files with 849 additions and 275 deletions

View File

@ -3,15 +3,15 @@ requires = [ "poetry-core",]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-JobSeekerTG"
version = "1.1.76" version = "1.1.76"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",] authors = [ "YM "]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/yariv245/JobSeekerTG"
readme = "README.md" readme = "README.md"
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",] keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
[[tool.poetry.packages]] [[tool.poetry.packages]]
include = "jobspy" include = "JobSeekerTG"
from = "src" from = "src"
[tool.black] [tool.black]

Binary file not shown.

View File

@ -0,0 +1,17 @@
from cachebox import LRUCache
class CacheboxCacheManager:
    """In-memory cache facade over cachebox's LRUCache.

    Holds at most 50 entries; the least-recently-used entry is evicted
    when the cache is full.
    """

    def __init__(self):
        # Fixed capacity of 50 entries, LRU eviction.
        self._cache = LRUCache(50)

    def find(self, cache_id: str):
        """Return the cached value for *cache_id*, or None if absent."""
        return self._cache.get(cache_id)

    def save(self, cache_id: str, data) -> None:
        """Store *data* under *cache_id*, overwriting any previous value."""
        # Fixed docstring: it was a copy-paste of find()'s.
        self._cache.insert(cache_id, data)


# Module-level shared instance used across handlers/repositories.
cache_manager = CacheboxCacheManager()

View File

@ -1,34 +0,0 @@
import os
from pymongo import MongoClient
from pymongo.synchronous.database import Database
from config.settings import settings
from jobspy import create_logger
class MongoDB:
    """Singleton holder for a pymongo Database handle.

    The first instantiation reads connection settings, connects, and caches
    the instance; later calls to MongoDB() return the same object.
    """

    # Process-wide singleton instance (None until first construction).
    _instance = None
    # Shared pymongo Database handle, set once on first construction.
    db: Database = None

    def __new__(cls):
        """Create (once) and return the singleton, connecting to MongoDB.

        Raises:
            ValueError: if MONGO_URI or MONGO_DB_NAME settings are missing.
        """
        if cls._instance is not None:
            return cls._instance
        self = super().__new__(cls)
        # NOTE(review): the instance is cached *before* settings validation,
        # so a failed first init leaves a broken singleton (db is None) that
        # later calls will happily return — confirm this is intended.
        cls._instance = self
        logger = create_logger("Mongo Client")
        mongoUri = settings.mongo_uri
        if not mongoUri:
            logger.error("MONGO_URI environment variable is not set")
            raise ValueError("MONGO_URI environment variable is not set")
        client = MongoClient(mongoUri)
        database_name = settings.mongo_db_name
        if not database_name:
            logger.error("MONGO_DB_NAME environment variable is not set")
            raise ValueError(
                "MONGO_DB_NAME environment variable is not set")
        self.db = client[database_name]
        logger.info("Succeed connect to MongoDB")
        return cls._instance

View File

@ -1,29 +0,0 @@
from .model import GoozaliColumn
# Maps JobPost attribute names to the matching Goozali column display names,
# used when translating scraped Goozali rows into JobPost fields.
job_post_column_to_goozali_column = {
    "date_posted": "Discovered",
    "field": "Field",
    "title": "Job Title",
    "job_url": "Position Link",
    "company_name": "Company",
    "description": "Requirements",
    "location": "Location",
    "company_industry": "Company Industry",
    "id": "Job ID"
}

# JobPost attribute names that get populated from Goozali data.
job_post_column_names = ["id",
                         "date_posted",
                         "field",
                         "title",
                         "job_url",
                         "company_name",
                         "description",
                         "location",
                         "company_industry"]
# Key mapper: Extract 'name' as the key
def extract_goozali_column_name(column):
    """Return column.name when given a GoozaliColumn, otherwise None."""
    if isinstance(column, GoozaliColumn):
        return column.name
    return None

View File

@ -1,13 +1,13 @@
import os
from telegram import Update from telegram import Update
from telegram.ext import Application, CommandHandler, CallbackQueryHandler, Updater from telegram.ext import Application, CommandHandler, CallbackQueryHandler
from config.settings import settings from config.settings import settings
from jobspy.scrapers.site import Site from scrapers import Site
from jobspy.scrapers.utils import create_logger from scrapers.utils import create_logger
from telegram_handler import TelegramDefaultHandler from telegram_handler import TelegramDefaultHandler
from telegram_handler.button_callback.telegram_callback_handler import TelegramCallHandler from telegram_handler.button_callback.telegram_callback_handler import TelegramCallHandler
from telegram_handler.telegram_myinfo_handler import my_info_handler
from telegram_handler.telegram_start_handler import start_conv_handler
logger = create_logger("Main") logger = create_logger("Main")
_api_token = settings.telegram_api_token _api_token = settings.telegram_api_token
@ -17,52 +17,34 @@ title_filters: list[str] = ["test", "qa", "Lead", "Full-Stack", "Full Stack", "F
"automation", "BI ", "Principal", "Architect", "Android", "Machine Learning", "Student", "automation", "BI ", "Principal", "Architect", "Android", "Machine Learning", "Student",
"Data Engineer", "DevSecOps"] "Data Engineer", "DevSecOps"]
async def stop(update, context):
logger.info("Stop polling from telegram")
application.stop_running()
if __name__ == "__main__": if __name__ == "__main__":
logger.info("Starting initialize ") logger.info("Starting initialize ")
search_term = "software engineer" search_term = "software engineer"
locations = ["Tel Aviv, Israel", "Ramat Gan, Israel", locations = ["Tel Aviv, Israel", "Ramat Gan, Israel",
"Central, Israel", "Rehovot ,Israel"] "Central, Israel", "Rehovot ,Israel"]
application.add_handler(start_conv_handler)
tg_callback_handler = TelegramCallHandler() tg_callback_handler = TelegramCallHandler()
tg_handler_all = TelegramDefaultHandler(sites=[Site.LINKEDIN, Site.GLASSDOOR, Site.INDEED, Site.GOOZALI], tg_handler_all = TelegramDefaultHandler(sites=[Site.LINKEDIN, Site.GLASSDOOR, Site.INDEED, Site.GOOZALI])
locations=locations,
title_filters=title_filters,
search_term=search_term)
application.add_handler(CommandHandler("find", tg_handler_all.handle)) application.add_handler(CommandHandler("find", tg_handler_all.handle))
# Goozali # Goozali
tg_handler_goozali = TelegramDefaultHandler(sites=[Site.GOOZALI], tg_handler_goozali = TelegramDefaultHandler(sites=[Site.GOOZALI])
locations=locations,
title_filters=title_filters,
search_term=search_term)
application.add_handler(CommandHandler( application.add_handler(CommandHandler(
Site.GOOZALI.value, tg_handler_goozali.handle)) Site.GOOZALI.value, tg_handler_goozali.handle))
# GlassDoor # GlassDoor
tg_handler_glassdoor = TelegramDefaultHandler(sites=[Site.GLASSDOOR], tg_handler_glassdoor = TelegramDefaultHandler(sites=[Site.GLASSDOOR])
locations=locations,
title_filters=title_filters,
search_term=search_term)
application.add_handler(CommandHandler( application.add_handler(CommandHandler(
Site.GLASSDOOR.value, tg_handler_glassdoor.handle)) Site.GLASSDOOR.value, tg_handler_glassdoor.handle))
# LinkeDin # LinkeDin
tg_handler_linkedin = TelegramDefaultHandler(sites=[Site.LINKEDIN], tg_handler_linkedin = TelegramDefaultHandler(sites=[Site.LINKEDIN])
locations=locations,
title_filters=title_filters,
search_term=search_term)
application.add_handler(CommandHandler( application.add_handler(CommandHandler(
Site.LINKEDIN.value, tg_handler_linkedin.handle)) Site.LINKEDIN.value, tg_handler_linkedin.handle))
# Indeed # Indeed
tg_handler_indeed = TelegramDefaultHandler(sites=[Site.INDEED], tg_handler_indeed = TelegramDefaultHandler(sites=[Site.INDEED])
locations=locations,
title_filters=title_filters,
search_term=search_term)
application.add_handler(CommandHandler( application.add_handler(CommandHandler(
Site.INDEED.value, tg_handler_indeed.handle)) Site.INDEED.value, tg_handler_indeed.handle))
application.add_handler(CommandHandler(
"myInfo", my_info_handler.handle))
application.add_handler(CallbackQueryHandler( application.add_handler(CallbackQueryHandler(
tg_callback_handler.button_callback)) tg_callback_handler.button_callback))
application.add_handler(CommandHandler('stop', stop))
logger.info("Run polling from telegram") logger.info("Run polling from telegram")
application.run_polling(allowed_updates=Update.ALL_TYPES) application.run_polling(allowed_updates=Update.ALL_TYPES)

26
src/model/Position.py Normal file
View File

@ -0,0 +1,26 @@
from enum import Enum
class Position(str, Enum):
    """Job position titles a user can register an interest in.

    A str-mixin enum, so members compare equal to their display-name
    strings and serialize as plain strings (e.g. in pydantic models and
    MongoDB documents). Lookup by value: Position("Backend Developer").
    """

    BACKEND_DEVELOPER = "Backend Developer"
    FULLSTACK_DEVELOPER = "Fullstack Developer"
    FRONTEND_DEVELOPER = "Frontend Developer"
    DATA_SCIENTIST = "Data Scientist"
    DATA_ANALYST = "Data Analyst"
    PROJECT_MANAGER = "Project Manager"
    CLOUD_ENGINEER = "Cloud Engineer"
    CLOUD_ARCHITECT = "Cloud Architect"
    UX_UI_DESIGNER = "UX/UI Designer"
    PRODUCT_MANAGER = "Product Manager"
    DEV_OPS_ENGINEER = "DevOps Engineer"
    BUSINESS_ANALYST = "Business Analyst"
    CYBERSECURITY_ENGINEER = "Cybersecurity Engineer"
    MACHINE_LEARNING_ENGINEER = "Machine Learning Engineer"
    ARTIFICIAL_INTELLIGENCE_ENGINEER = "Artificial Intelligence Engineer"
    DATABASE_ADMINISTRATOR = "Database Administrator"
    SYSTEMS_ADMINISTRATOR = "Systems Administrator"
    NETWORK_ENGINEER = "Network Engineer"
    TECHNICAL_SUPPORT_SPECIALIST = "Technical Support Specialist"
    SALES_ENGINEER = "Sales Engineer"
    SCRUM_MASTER = "Scrum Master"
    IT_MANAGER = "IT Manager"

34
src/model/User.py Normal file
View File

@ -0,0 +1,34 @@
from typing import Optional, Union
from pydantic import BaseModel, Field
from model.Position import Position
class User(BaseModel):
    """Telegram user profile with job-search preferences."""

    full_name: str
    username: str
    # Telegram chat id (int from the API; str once round-tripped).
    chat_id: Union[int, str] = None
    # Years of professional experience.
    experience: Union[int, str] = None
    # Maximum age, in hours, of job postings the user wants to see.
    job_age: Union[int, str] = None
    position: Optional[Position] = None
    cities: Optional[list[str]] = None
    title_filters: Optional[list[str]] = None

    def get_myinfo_message(self) -> str:
        """Build the human-readable profile summary (sent for /myInfo).

        Unset/falsy optional fields are omitted from the message.
        """
        message = "Here's your profile:\n\n"
        message += f"Full Name: {self.full_name}\n"
        message += f"Username: @{self.username}\n"
        if self.chat_id:
            message += f"Chat ID: {self.chat_id}\n"
        if self.job_age:
            # Bug fix: this line previously printed self.experience.
            message += f"Job Age (Hours): {self.job_age}\n"
        if self.experience:
            message += f"Experience(Years): {self.experience}\n"
        if self.position:
            message += f"Position Level: {self.position.value}\n"
        if self.cities:
            message += f"Preferred Cities: {', '.join(self.cities)}\n"
        if self.title_filters:
            message += f"Job Title Filters: {', '.join(self.title_filters)}\n"
        return message

View File

View File

@ -0,0 +1,17 @@
from bson.codec_options import TypeCodec
from model.Position import Position
class PositionCodec(TypeCodec):
    """bson TypeCodec storing Position members as their display-name strings."""

    python_type = Position
    bson_type = str

    def transform_python(self, value):
        """Encode a Position for MongoDB as its value string.

        Bug fix: this previously returned value.name (e.g. "BACKEND_DEVELOPER"),
        which transform_bson could never decode — Position(...) looks members
        up by *value* (e.g. "Backend Developer"), so round-trips raised
        ValueError.
        """
        return value.value

    def transform_bson(self, value):
        """Decode a stored display-name string back into a Position member."""
        return Position(value)


# position_codec = PositionCodec()

View File

@ -3,27 +3,17 @@ from typing import Optional
from dotenv import load_dotenv from dotenv import load_dotenv
from pymongo import UpdateOne from pymongo import UpdateOne
from .monogo_db import MongoDB from scrapers import create_logger
from jobspy import create_logger from jobs import JobPost
from jobspy.jobs import JobPost from .monogo_db import mongo_client
load_dotenv() load_dotenv()
class JobRepository: class JobRepository:
_instance = None def __init__(self):
self._logger = create_logger("JobRepository")
def __new__(cls): self._collection = mongo_client.get_collection('jobs')
if cls._instance is not None:
return cls._instance
self = super().__new__(cls)
cls._instance = self
self.logger = create_logger("JobRepository")
mongo_client = MongoDB()
self.collection = mongo_client.db["jobs"]
return cls._instance
def find_by_id(self, job_id: str) -> Optional[JobPost]: def find_by_id(self, job_id: str) -> Optional[JobPost]:
""" """
@ -35,7 +25,7 @@ class JobRepository:
Returns: Returns:
The job document if found, otherwise None. The job document if found, otherwise None.
""" """
result = self.collection.find_one({"id": job_id}) result = self._collection.find_one({"id": job_id})
return JobPost(**result) return JobPost(**result)
def update(self, job: JobPost) -> bool: def update(self, job: JobPost) -> bool:
@ -48,7 +38,7 @@ class JobRepository:
Returns: Returns:
True if the update was successful, False otherwise. True if the update was successful, False otherwise.
""" """
result = self.collection.update_one({"id": job.id}, {"$set": job.model_dump(exclude={"date_posted"})}) result = self._collection.update_one({"id": job.id}, {"$set": job.model_dump(exclude={"date_posted"})})
return result.modified_count > 0 return result.modified_count > 0
def insert_job(self, job: JobPost): def insert_job(self, job: JobPost):
@ -62,8 +52,8 @@ class JobRepository:
Exception: If an error occurs during insertion. Exception: If an error occurs during insertion.
""" """
job_dict = job.model_dump(exclude={"date_posted"}) job_dict = job.model_dump(exclude={"date_posted"})
self.collection.insert_one(job_dict) self._collection.insert_one(job_dict)
self.logger.info(f"Inserted new job with title {job.title}.") self._logger.info(f"Inserted new job with title {job.title}.")
def insert_many_if_not_found(self, jobs: list[JobPost]) -> tuple[list[JobPost], list[JobPost]]: def insert_many_if_not_found(self, jobs: list[JobPost]) -> tuple[list[JobPost], list[JobPost]]:
""" """
@ -86,8 +76,8 @@ class JobRepository:
if operations: if operations:
# Execute all operations in bulk # Execute all operations in bulk
result = self.collection.bulk_write(operations) result = self._collection.bulk_write(operations)
self.logger.info(f"Matched: {result.matched_count}, Upserts: { self._logger.info(f"Matched: {result.matched_count}, Upserts: {
result.upserted_count}, Modified: {result.modified_count}") result.upserted_count}, Modified: {result.modified_count}")
# Get the newly inserted jobs (those that were upserted) # Get the newly inserted jobs (those that were upserted)
@ -99,3 +89,5 @@ class JobRepository:
old_jobs.append(job) old_jobs.append(job)
return old_jobs, new_jobs return old_jobs, new_jobs
job_repository = JobRepository()

38
src/model/monogo_db.py Normal file
View File

@ -0,0 +1,38 @@
from pymongo import MongoClient
from pymongo.synchronous.database import Database
from config.settings import settings
from scrapers.utils import create_logger
class MongoDB:
    """Wraps a pymongo client bound to the configured application database."""

    def __init__(self):
        """Connect to MongoDB using settings; raise ValueError on missing config."""
        logger = create_logger("Mongo Client")
        uri = settings.mongo_uri
        if not uri:
            logger.error("MONGO_URI environment variable is not set")
            raise ValueError("MONGO_URI environment variable is not set")
        connection = MongoClient(uri)
        db_name = settings.mongo_db_name
        if not db_name:
            logger.error("MONGO_DB_NAME environment variable is not set")
            raise ValueError(
                "MONGO_DB_NAME environment variable is not set")
        self._db: Database = connection[db_name]
        logger.info("Succeed connect to MongoDB")

    def get_collection(self,
                       name: str,
                       codec_options=None,
                       read_preference=None,
                       write_concern=None,
                       read_concern=None):
        """Return the named collection, forwarding optional codec/concern overrides."""
        return self._db.get_collection(
            name, codec_options, read_preference, write_concern, read_concern)


# Shared, eagerly-connected client used by the repositories.
mongo_client = MongoDB()

View File

@ -0,0 +1,129 @@
from typing import Optional
from dotenv import load_dotenv
from pymongo import UpdateOne
from config.cache_manager import cache_manager
from scrapers.utils import create_logger
from .User import User
from .monogo_db import mongo_client
load_dotenv()
class UserRepository:
    """MongoDB-backed store for User documents with a write-through cache."""

    def __init__(self):
        self._logger = create_logger("UserRepository")
        self._collection = mongo_client.get_collection('user')
        # Username is the natural key for lookups and upserts.
        self._collection.create_index('username', unique=True)

    def find_by_id(self, user_id: str) -> Optional[User]:
        """
        Finds a user document in the collection by its ID.

        Args:
            user_id: The ID of the user to find.

        Returns:
            The user document if found, otherwise None.
        """
        cached_user = cache_manager.find(user_id)
        if cached_user:
            return cached_user
        # NOTE(review): users are inserted via model_dump(), which produces no
        # "id" field — confirm this query can ever match a stored document.
        result = self._collection.find_one({"id": user_id})
        if not result:
            return None
        user = User(**result)
        cache_manager.save(user_id, user)
        return user

    def find_by_username(self, username: str) -> Optional[User]:
        """
        Finds a user document in the collection by its username.

        Args:
            username: The username of the user to find.

        Returns:
            The user document if found, otherwise None.
        """
        cached_user = cache_manager.find(username)
        if cached_user:
            return cached_user
        result = self._collection.find_one({"username": username})
        # Replaced leftover debug text ("find user by usernameeeeeeee") with a
        # meaningful, lazily-formatted log line.
        self._logger.info("Cache miss - queried MongoDB for username %s", username)
        if not result:
            return None
        user = User(**result)
        cache_manager.save(username, user)
        return user

    def update(self, user: User) -> bool:
        """
        Updates a User in the database.

        Args:
            user: The User to persist (matched by username).

        Returns:
            True if the update was successful, False otherwise.
        """
        result = self._collection.update_one(
            {"username": user.username}, {"$set": user.model_dump()})
        return result.modified_count > 0

    def insert_user(self, user: User):
        """
        Inserts a new user posting into the database collection.

        Args:
            user (User): The User object to be inserted.

        Raises:
            Exception: If an error occurs during insertion.
        """
        self._collection.insert_one(user.model_dump())
        cache_manager.save(user.username, user)
        self._logger.info(f"Inserted new user with username {user.username}.")

    def insert_many_if_not_found(self, users: list[User]) -> tuple[list[User], list[User]]:
        """
        Perform bulk upserts for a list of User objects into a MongoDB collection.
        Only insert new users; returns (old_users, new_users).
        """
        operations = []
        for user in users:
            operations.append(
                UpdateOne(
                    # Bug fix: match on username — User has no "id" attribute,
                    # so the previous {"id": user.id} raised AttributeError.
                    {"username": user.username},
                    # Only set fields if the user is being inserted (not updated)
                    {"$setOnInsert": user.model_dump()},
                    upsert=True  # Insert if not found, do not update existing
                )
            )

        new_users = []  # Users newly inserted by this call
        old_users = []  # Users that already existed
        if operations:
            result = self._collection.bulk_write(operations)
            self._logger.info("Matched: %s, Upserts: %s, Modified: %s",
                              result.matched_count, result.upserted_count,
                              result.modified_count)
            # upserted_ids maps operation index -> inserted _id, telling us
            # exactly which users were new. (The old heuristic assumed the
            # first upserted_count entries were the new ones, which is wrong
            # when inserts are interleaved with existing users.)
            upserted_indexes = set(result.upserted_ids or {})
            for i, user in enumerate(users):
                if i in upserted_indexes:
                    new_users.append(user)
                else:
                    old_users.append(user)
        return old_users, new_users


user_repository = UserRepository()

View File

@ -2,34 +2,36 @@ from __future__ import annotations
import re import re
from threading import Lock from threading import Lock
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .scrapers.site import Site
from .scrapers.goozali import GoozaliScraper from jobs import (
Enum,
from .jobs import JobPost, JobType, Location JobType,
from .scrapers.utils import set_logger_level, extract_salary, create_logger JobResponse,
from .scrapers.indeed import IndeedScraper Country,
from .scrapers.ziprecruiter import ZipRecruiterScraper JobPost,
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
GlassdoorException,
GoogleJobsException,
) )
from model.User import User
from .glassdoor import GlassdoorScraper
from .google import GoogleJobsScraper
from .goozali import GoozaliScraper
from .indeed import IndeedScraper
from .linkedin import LinkedInScraper
from .scraper_input import ScraperInput
from .site import Site
from .utils import set_logger_level, create_logger
from .ziprecruiter import ZipRecruiterScraper
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
def scrape_jobs( def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None, site_name: str | list[str] | Site | list[Site] | None = None,
user: User = None,
search_term: str | None = None, search_term: str | None = None,
google_search_term: str | None = None, google_search_term: str | None = None,
location: str | None = None, location: str | None = None,
@ -55,7 +57,7 @@ def scrape_jobs(
) -> (list[JobPost], list[JobPost]): ) -> (list[JobPost], list[JobPost]):
""" """
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
:return: pandas dataframe containing job data :return: list of jobPost, list of new jobPost
""" """
SCRAPER_MAPPING = { SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper, Site.LINKEDIN: LinkedInScraper,
@ -93,6 +95,7 @@ def scrape_jobs(
country_enum = Country.from_string(country_indeed) country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput( scraper_input = ScraperInput(
user=user,
site_type=get_site_type(), site_type=get_site_type(),
country=country_enum, country=country_enum,
search_term=search_term, search_term=search_term,
@ -111,7 +114,7 @@ def scrape_jobs(
hours_old=hours_old hours_old=hours_old
) )
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert) scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
@ -166,6 +169,10 @@ def scrape_jobs(
""" """
filtered_jobs = [] filtered_jobs = []
remaining_jobs = [] remaining_jobs = []
if not filter_by_title:
return filtered_jobs, remaining_jobs
for job in jobs: for job in jobs:
for filter_title in filter_by_title: for filter_title in filter_by_title:
if re.search(filter_title, job.title, re.IGNORECASE): if re.search(filter_title, job.title, re.IGNORECASE):

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.exceptions scrapers.exceptions
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains the set of Scrapers' exceptions. This module contains the set of Scrapers' exceptions.

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.glassdoor scrapers.glassdoor
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Glassdoor. This module contains routines to scrape Glassdoor.
@ -7,7 +7,6 @@ This module contains routines to scrape Glassdoor.
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass
import re import re
import json import json
import requests import requests
@ -18,14 +17,16 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from .GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type from .GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
from .constants import fallback_token, query_template, headers from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import extract_emails_from_text, create_logger from ..utils import extract_emails_from_text, create_logger
from ..exceptions import GlassdoorException from ..exceptions import GlassdoorException
from ..utils import ( from ..utils import (
create_session, create_session,
markdown_converter, markdown_converter,
) )
from ...jobs import ( from jobs import (
JobPost, JobPost,
Compensation, Compensation,
CompensationInterval, CompensationInterval,

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.google scrapers.google
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Google. This module contains routines to scrape Google.
@ -14,12 +14,14 @@ from typing import Tuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from .constants import headers_jobs, headers_initial, async_param from .constants import headers_jobs, headers_initial, async_param
from .. import Scraper, ScraperInput, Site from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import extract_emails_from_text, create_logger, extract_job_type from ..utils import extract_emails_from_text, create_logger, extract_job_type
from ..utils import ( from ..utils import (
create_session, create_session,
) )
from ...jobs import ( from jobs import (
JobPost, JobPost,
JobResponse, JobResponse,
Location, Location,

View File

@ -1,7 +1,7 @@
from datetime import datetime from datetime import datetime
import json import json
from jobspy.jobs import JobPost, Location from jobs import JobPost, Location
from .model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliResponseData from .model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliResponseData
from .constants import job_post_column_to_goozali_column, job_post_column_names from .constants import job_post_column_to_goozali_column, job_post_column_names

View File

@ -1,6 +1,6 @@
from datetime import datetime, timedelta from datetime import datetime, timedelta
from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliFieldChoice
from ..utils import create_logger from ..utils import create_logger
# Mapping function to convert parsed dictionary into GoozaliResponseData # Mapping function to convert parsed dictionary into GoozaliResponseData
@ -13,12 +13,20 @@ class GoozaliScrapperComponent:
pass pass
# Function to filter GoozaliRows based on hours old # Function to filter GoozaliRows based on hours old
def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn, column_choice: GoozaliColumnChoice) -> list[GoozaliRow]: def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn,
column_choices: list[GoozaliColumnChoice]) -> list[GoozaliRow]:
return [ return [
row for row in rows row
if row.cellValuesByColumnId[column.id] == column_choice.id for row in rows
if row.cellValuesByColumnId.get(column.id)
and any(choice.id == row.cellValuesByColumnId[column.id] for choice in column_choices)
] ]
# return [
# row for row in rows
# if row.cellValuesByColumnId[column.id] == column_choice.id
# ]
def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]: def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
# Current time # Current time
now = datetime.now() now = datetime.now()
@ -39,14 +47,20 @@ class GoozaliScrapperComponent:
if (column.name == column_name): if (column.name == column_name):
return column return column
def find_choice_from_column(self, column: GoozaliColumn, choice_name: str) -> GoozaliColumnChoice: def find_choices_from_column(self, column: GoozaliColumn, choices: list[GoozaliFieldChoice]) -> list[
GoozaliColumnChoice]:
if not column.typeOptions.choices: if not column.typeOptions.choices:
logger.exception(f"Choices for column {column.name} doesn't exist") logger.exception(f"Choices for column {column.name} doesn't exist")
raise Exception(f"Choices for column {column.name} doesn't exist") raise Exception(f"Choices for column {column.name} doesn't exist")
chosen_values = [c.value for c in choices]
goozali_column_choices = []
for key, choice in column.typeOptions.choices.items(): for key, choice in column.typeOptions.choices.items():
if (choice.name == choice_name): if choice.name in chosen_values:
return choice goozali_column_choices.append(choice)
logger.exception(f"Can't find {choice_name} for column {column.name}") if len(goozali_column_choices) == 0:
raise Exception(f"Can't find {choice_name} for column {column.name}") logger.exception(f"Can't find {choices} for column {column.name}")
raise Exception(f"Can't find {choices} for column {column.name}")
return goozali_column_choices

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.Goozali scrapers.Goozali
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Goozali. This module contains routines to scrape Goozali.
@ -7,20 +7,20 @@ This module contains routines to scrape Goozali.
from __future__ import annotations from __future__ import annotations
from jobs import (
from .. import Scraper, ScraperInput
from .GoozaliMapper import GoozaliMapper
from .GoozaliScrapperComponent import GoozaliScrapperComponent
from .constants import extract_goozali_column_name, job_post_column_to_goozali_column
from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, GoozaliFullRequest
from ..site import Site
from ..utils import create_dict_by_key_and_value, create_session, create_logger
from ...jobs import (
JobPost, JobPost,
JobResponse, JobResponse,
) )
logger = create_logger("Goozali") from .GoozaliMapper import GoozaliMapper
from .GoozaliScrapperComponent import GoozaliScrapperComponent
from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, GoozaliFullRequest
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import create_dict_by_key_and_value, create_session, create_logger
logger = create_logger("GoozaliScraper")
class GoozaliScraper(Scraper): class GoozaliScraper(Scraper):
@ -72,22 +72,19 @@ class GoozaliScraper(Scraper):
except Exception as e: except Exception as e:
logger.error(f"Exception: {str(e)}") logger.error(f"Exception: {str(e)}")
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
# model the response with models
goozali_response = self.mapper.map_response_to_goozali_response( goozali_response = self.mapper.map_response_to_goozali_response(
response=response) response=response)
# suggestL create groupby field and then filter by hours
# filter result by Field
column = self.component.find_column( column = self.component.find_column(
goozali_response.data.columns, job_post_column_to_goozali_column["field"]) goozali_response.data.columns, job_post_column_to_goozali_column["field"])
column_choice = self.component.find_choice_from_column( user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
column, GoozaliFieldChoice.SOFTWARE_ENGINEERING.value) column_choices = self.component.find_choices_from_column(
column, user_goozali_fields)
filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice( filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
goozali_response.data.rows, column, column_choice) goozali_response.data.rows, column, column_choices)
filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours( filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
filtered_rows_by_column_choice, scraper_input.hours_old) filtered_rows_by_column_choice, scraper_input.hours_old)
dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value( dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
goozali_response.data.columns, extract_goozali_column_name) goozali_response.data.columns, extract_goozali_column_name)
# map to JobResponse Object
for row in filtered_rows_by_age_and_column_choice: for row in filtered_rows_by_age_and_column_choice:
job_post = self.mapper.map_goozali_response_to_job_post( job_post = self.mapper.map_goozali_response_to_job_post(
row, dict_column_name_to_column) row, dict_column_name_to_column)

View File

@ -0,0 +1,92 @@
from model.Position import Position
from .model import GoozaliColumn, GoozaliFieldChoice
# Maps JobPost attribute names to the matching Goozali column display names,
# used when translating scraped Goozali rows into JobPost fields.
job_post_column_to_goozali_column = {
    "date_posted": "Discovered",
    "field": "Field",
    "title": "Job Title",
    "job_url": "Position Link",
    "company_name": "Company",
    "description": "Requirements",
    "location": "Location",
    "company_industry": "Company Industry",
    "id": "Job ID"
}

# JobPost attribute names that get populated from Goozali data.
job_post_column_names = ["id",
                         "date_posted",
                         "field",
                         "title",
                         "job_url",
                         "company_name",
                         "description",
                         "location",
                         "company_industry"]

# Display names of the Goozali "Field" column choices.
# NOTE(review): appears to be reference data only — GoozaliFieldChoice seems to
# be the canonical enum; confirm this list is still needed.
fields = ["Product Management",
          "Data Analyst",
          "Data Science, ML & Algorithms",
          "Software Engineering",
          "QA",
          "Cybersecurity",
          "IT and System Administration",
          "Frontend Development",
          "DevOps",
          "UI/UX, Design & Content",
          "HR & Recruitment",
          "Mobile Development",
          "Hardware Engineering",
          "Embedded, Low Level & Firmware Engineering",
          "Customer Success",
          "Project Management",
          "Operations",
          "Finance",
          "Systems Engineering",
          "Marketing",
          "Sales",
          "Compliance, Legal & Policy",
          "C-Level",
          "Business Development",
          "Mechanical Engineering",
          "Natural Science",
          "Other"]
def create_position_to_goozali_field_map():
    """Return the lookup table from a Position to its relevant Goozali fields.

    Returns:
        dict: each supported Position mapped to a fresh list of
        GoozaliFieldChoice entries used to filter board rows.
    """
    # Local alias keeps the table below readable.
    F = GoozaliFieldChoice
    return {
        Position.BACKEND_DEVELOPER: [F.SOFTWARE_ENGINEERING],
        Position.FULLSTACK_DEVELOPER: [F.SOFTWARE_ENGINEERING],
        Position.FRONTEND_DEVELOPER: [F.FRONTEND_DEVELOPMENT, F.SOFTWARE_ENGINEERING],
        Position.DATA_SCIENTIST: [F.DATA_SCIENCE_ML_ALGORITHMS],
        Position.DATA_ANALYST: [F.DATA_ANALYST],
        Position.PROJECT_MANAGER: [F.PROJECT_MANAGEMENT],
        Position.CLOUD_ENGINEER: [F.DEVOPS, F.IT_AND_SYSTEM_ADMINISTRATION],
        Position.CLOUD_ARCHITECT: [F.DEVOPS, F.IT_AND_SYSTEM_ADMINISTRATION],
        Position.UX_UI_DESIGNER: [F.UI_UX_DESIGN_CONTENT],
        Position.PRODUCT_MANAGER: [F.PRODUCT_MANAGEMENT],
        Position.DEV_OPS_ENGINEER: [F.DEVOPS],
        Position.BUSINESS_ANALYST: [F.BUSINESS_DEVELOPMENT],
        Position.CYBERSECURITY_ENGINEER: [F.CYBERSECURITY],
        Position.MACHINE_LEARNING_ENGINEER: [F.DATA_SCIENCE_ML_ALGORITHMS],
        Position.ARTIFICIAL_INTELLIGENCE_ENGINEER: [F.DATA_SCIENCE_ML_ALGORITHMS],
        Position.DATABASE_ADMINISTRATOR: [F.IT_AND_SYSTEM_ADMINISTRATION],
        Position.SYSTEMS_ADMINISTRATOR: [F.IT_AND_SYSTEM_ADMINISTRATION],
        Position.NETWORK_ENGINEER: [F.IT_AND_SYSTEM_ADMINISTRATION],
        Position.TECHNICAL_SUPPORT_SPECIALIST: [F.IT_AND_SYSTEM_ADMINISTRATION],
        Position.SALES_ENGINEER: [F.SALES],
        Position.SCRUM_MASTER: [F.PROJECT_MANAGEMENT],
        Position.IT_MANAGER: [F.IT_AND_SYSTEM_ADMINISTRATION],
    }
# Build the module-level map once at import time.
position_to_goozali_field_map = create_position_to_goozali_field_map()


def extract_goozali_column_name(column):
    """Key mapper: return ``column.name`` for GoozaliColumn instances, else None."""
    if isinstance(column, GoozaliColumn):
        return column.name
    return None

View File

@ -6,13 +6,13 @@ class GoozaliFullRequest():
self.view_id: str = "viwIOzPYaUGxlA0Jd" self.view_id: str = "viwIOzPYaUGxlA0Jd"
self.url = base_url.format(view_id=self.view_id) self.url = base_url.format(view_id=self.view_id)
self.application_id: str = "appwewqLk7iUY4azc" self.application_id: str = "appwewqLk7iUY4azc"
self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s" self.air_table_page_load_id: str = "pglke45UFwdvQgBNJ"
self.stringifiedObjectParams = { self.stringifiedObjectParams = {
"shouldUseNestedResponseFormat": "true"} "shouldUseNestedResponseFormat": "true"}
self.cookies: dict[str, str] = {} self.cookies: dict[str, str] = {}
self.request_id: str = "req4q4tKw3woEEWxw&" self.request_id: str = "reqGjlEjOQFyRssam"
self.share_id: str = "shrQBuWjXd0YgPqV6" self.share_id: str = "shrQBuWjXd0YgPqV6"
self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59" self.signature: str = "7a1402a3f7f6f9a23c8db3849878812f2d3141da60f3b3d6e14dd4a910b91b74"
self.headers = self._generate_headers() self.headers = self._generate_headers()
self.params = self._generate_params() self.params = self._generate_params()
self.cookies = {} self.cookies = {}
@ -66,7 +66,7 @@ class GoozaliFullRequest():
"shareId": self.share_id, "shareId": self.share_id,
"applicationId": self.application_id, "applicationId": self.application_id,
"generationNumber": 0, "generationNumber": 0,
"expires": "2025-01-02T00:00:00.000Z", "expires": "2025-01-30T00:00:00.000Z",
"signature": self.signature "signature": self.signature
} }
# Convert to a JSON string # Convert to a JSON string

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.indeed scrapers.indeed
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Indeed. This module contains routines to scrape Indeed.
@ -12,7 +12,9 @@ from typing import Tuple
from datetime import datetime from datetime import datetime
from .constants import job_search_query, api_headers from .constants import job_search_query, api_headers
from .. import Scraper, ScraperInput, Site from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..utils import ( from ..utils import (
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
@ -20,7 +22,7 @@ from ..utils import (
create_session, create_session,
create_logger, create_logger,
) )
from ...jobs import ( from jobs import (
JobPost, JobPost,
Compensation, Compensation,
CompensationInterval, CompensationInterval,

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.linkedin scrapers.linkedin
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape LinkedIn. This module contains routines to scrape LinkedIn.
@ -17,13 +17,15 @@ from datetime import datetime
from bs4.element import Tag from bs4.element import Tag
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException from requests.exceptions import RetryError
from urllib3.exceptions import MaxRetryError from urllib3.exceptions import MaxRetryError
from .constants import headers from .constants import headers
from .. import Scraper, ScraperInput, Site from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..site import Site
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
from ..utils import create_session, remove_attributes, create_logger from ..utils import create_session, remove_attributes, create_logger
from ...jobs import ( from jobs import (
JobPost, JobPost,
Location, Location,
JobResponse, JobResponse,

17
src/scrapers/scraper.py Normal file
View File

@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
from jobs import JobResponse
from scrapers.site import Site
from scrapers.scraper_input import ScraperInput
class Scraper(ABC):
    """Abstract base class for every site-specific job scraper.

    Concrete subclasses implement :meth:`scrape` for a single job board
    (LinkedIn, Indeed, Glassdoor, ...).
    """

    def __init__(
        self,
        site: Site,
        proxies: list[str] | None = None,
        ca_cert: str | None = None,
    ):
        """Store the target site plus optional proxy list and CA-cert path."""
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run a scrape for *scraper_input* and return the collected jobs."""

View File

@ -1,25 +1,13 @@
from __future__ import annotations from pydantic import BaseModel
from abc import ABC, abstractmethod from jobs import Country, JobType, DescriptionFormat
from model.User import User
from .site import Site from scrapers.site import Site
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat,
)
class SalarySource(Enum):
DIRECT_DATA = "direct_data"
DESCRIPTION = "description"
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
site_type: list[Site] site_type: list[Site]
user: User
search_term: str | None = None search_term: str | None = None
google_search_term: str | None = None google_search_term: str | None = None
@ -37,15 +25,3 @@ class ScraperInput(BaseModel):
results_wanted: int = 15 results_wanted: int = 15
hours_old: int | None = None hours_old: int | None = None
class Scraper(ABC):
def __init__(
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
):
self.site = site
self.proxies = proxies
self.ca_cert = ca_cert
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@ -11,11 +11,11 @@ import numpy as np
from markdownify import markdownify as md from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry from requests.adapters import HTTPAdapter, Retry
from ..jobs import CompensationInterval, JobType from jobs import CompensationInterval, JobType
def create_logger(name: str): def create_logger(name: str):
logger = logging.getLogger(f"JobSpy:{name}") logger = logging.getLogger(f"JobSeekerTG:{name}")
logger.propagate = False logger.propagate = False
if not logger.handlers: if not logger.handlers:
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
@ -143,7 +143,7 @@ def set_logger_level(verbose: int = 2):
level = getattr(logging, level_name.upper(), None) level = getattr(logging, level_name.upper(), None)
if level is not None: if level is not None:
for logger_name in logging.root.manager.loggerDict: for logger_name in logging.root.manager.loggerDict:
if logger_name.startswith("JobSpy:"): if logger_name.startswith("JobSeekerTG:"):
logging.getLogger(logger_name).setLevel(level) logging.getLogger(logger_name).setLevel(level)
else: else:
raise ValueError(f"Invalid log level: {level_name}") raise ValueError(f"Invalid log level: {level_name}")

View File

@ -1,5 +1,5 @@
""" """
jobspy.scrapers.ziprecruiter scrapers.ziprecruiter
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape ZipRecruiter. This module contains routines to scrape ZipRecruiter.
@ -19,7 +19,9 @@ from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .constants import headers from .constants import headers
from .. import Scraper, ScraperInput, Site from ..site import Site
from ..scraper import Scraper
from ..scraper_input import ScraperInput
from ..utils import ( from ..utils import (
extract_emails_from_text, extract_emails_from_text,
create_session, create_session,
@ -27,7 +29,7 @@ from ..utils import (
remove_attributes, remove_attributes,
create_logger, create_logger,
) )
from ...jobs import ( from jobs import (
JobPost, JobPost,
Compensation, Compensation,
Location, Location,

View File

@ -1,4 +1,3 @@
import os
from typing import Union from typing import Union
from dotenv import load_dotenv from dotenv import load_dotenv
@ -6,8 +5,8 @@ from telegram import Bot, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.constants import ReactionEmoji from telegram.constants import ReactionEmoji
from config.settings import settings from config.settings import settings
from jobspy.jobs import JobPost from jobs import JobPost
from jobspy.scrapers.utils import create_logger from scrapers.utils import create_logger
load_dotenv() load_dotenv()

View File

@ -3,8 +3,8 @@ from __future__ import annotations
from telegram import MaybeInaccessibleMessage from telegram import MaybeInaccessibleMessage
from telegram.constants import ReactionEmoji from telegram.constants import ReactionEmoji
from db.job_repository import JobRepository from scrapers import create_logger
from jobspy import create_logger from model.job_repository import job_repository
from telegram_handler.button_callback.button_fire_strategy import FireStrategy from telegram_handler.button_callback.button_fire_strategy import FireStrategy
from telegram_handler.button_callback.button_job_title_strategy import JobTitleStrategy from telegram_handler.button_callback.button_job_title_strategy import JobTitleStrategy
from telegram_handler.button_callback.button_poo_strategy import PooStrategy from telegram_handler.button_callback.button_poo_strategy import PooStrategy
@ -22,7 +22,6 @@ class ButtonCallBackContext:
self._data = data self._data = data
self._job_id = job_id self._job_id = job_id
self._strategy = None self._strategy = None
self._job_repository = JobRepository()
@property @property
def strategy(self) -> ButtonStrategy: def strategy(self) -> ButtonStrategy:
@ -49,7 +48,7 @@ class ButtonCallBackContext:
elif ReactionEmoji.PILE_OF_POO.name == self._data: elif ReactionEmoji.PILE_OF_POO.name == self._data:
self._strategy = PooStrategy(self._message) self._strategy = PooStrategy(self._message)
elif self._data: elif self._data:
job = self._job_repository.find_by_id(self._data) job = job_repository.find_by_id(self._data)
if job: if job:
chat_id = self._message.chat.id chat_id = self._message.chat.id
self._strategy = JobTitleStrategy(chat_id, job) self._strategy = JobTitleStrategy(chat_id, job)

View File

@ -1,8 +1,8 @@
from telegram import MaybeInaccessibleMessage from telegram import MaybeInaccessibleMessage
from telegram.constants import ReactionEmoji from telegram.constants import ReactionEmoji
from db.job_repository import JobRepository from scrapers import create_logger
from jobspy import create_logger from model.job_repository import job_repository
from telegram_bot import TelegramBot from telegram_bot import TelegramBot
from telegram_handler.button_callback.button_strategy import ButtonStrategy from telegram_handler.button_callback.button_strategy import ButtonStrategy
@ -16,16 +16,15 @@ class FireStrategy(ButtonStrategy):
self._message = message self._message = message
self._emoji = ReactionEmoji.FIRE self._emoji = ReactionEmoji.FIRE
self._telegram_bot = TelegramBot() self._telegram_bot = TelegramBot()
self._job_repository = JobRepository()
self._job_id = job_id self._job_id = job_id
self._logger = create_logger("FireStrategy") self._logger = create_logger("FireStrategy")
async def execute(self): async def execute(self):
job = self._job_repository.find_by_id(self._job_id) job = job_repository.find_by_id(self._job_id)
if not job: if not job:
self._logger.error(f"Job with ID {self._job_id} not found.") self._logger.error(f"Job with ID {self._job_id} not found.")
return return
job.applied = True job.applied = True
self._job_repository.update(job) job_repository.update(job)
chat_id = self._message.chat.id chat_id = self._message.chat.id
await self._telegram_bot.set_message_reaction(chat_id, self._message.message_id, self._emoji) await self._telegram_bot.set_message_reaction(chat_id, self._message.message_id, self._emoji)

View File

@ -1,6 +1,6 @@
from typing import Union from typing import Union
from jobspy import JobPost from scrapers import JobPost
from telegram_bot import TelegramBot from telegram_bot import TelegramBot
from telegram_handler.button_callback.button_strategy import ButtonStrategy from telegram_handler.button_callback.button_strategy import ButtonStrategy

View File

@ -3,7 +3,7 @@ from telegram.ext import (
ContextTypes, ContextTypes,
) )
from jobspy import create_logger from scrapers import create_logger
from telegram_bot import TelegramBot from telegram_bot import TelegramBot
from telegram_handler.button_callback.button_callback_context import ButtonCallBackContext from telegram_handler.button_callback.button_callback_context import ButtonCallBackContext

View File

@ -0,0 +1,46 @@
# User-facing bot copy for the /start conversation flow.
# NOTE(review): naming is inconsistent — `multi_value_message` is lowercase while
# every other constant is UPPER_SNAKE_CASE, and FILTER_TILE_MESSAGE looks like a
# typo for FILTER_TITLE_MESSAGE; renaming would break importers, so flagged only.
START_MESSAGE: str = "Hi there! I'm JobSeeker Bot, your friendly job search assistant.😊\n" \
                     "I'm here to help you find the perfect position.\n\n" \
                     "To stop chatting with me at any time, just send '/cancel'.\n\n"
POSITION_MESSAGE: str = "What kind of position are you looking for? ✨\n" \
                        "(e.g., Software Engineer, Data Scientist, Marketing Manager)"
POSITION_NOT_FOUND: str = "I couldn't find any positions matching your request. 😕\n" \
                          "Please try again"
# Shared suffix appended to prompts that accept comma-separated input.
multi_value_message: str = "Enter multiple values separated by commas (e.g., value1, value2, value3) ✍️"
LOCATION_MESSAGE: str = "Where are you hoping to find a position? 🌎\n" \
                        "(e.g., Rishon Lezion, New York City, San Francisco)\n\n" + multi_value_message
EXPERIENCE_MESSAGE: str = "How many years of professional experience do you have in this field? 💼\n"
EXPERIENCE_INVALID: str = "Oops! Please enter your experience in years as a number.😕" \
                          "For example, 2, 5, or 10."
JOB_AGE_MESSAGE: str = "How recent should the jobs be? ⏰\n" \
                       "(Enter the number of hours, e.g., 24 for last 24 hours, 168 for last week)"
# JOB_AGE_MESSAGE: str = "Within how many hours do you want to see jobs posted? ⏰\n" \
#                        "(Enter a number, e.g., 48 for the last 48 hours)"
JOB_AGE_INVALID: str = "Oops! Please enter a number for the number of hours. 😕\n" \
                       "For example, 24, 48, or 168."
FILTER_TILE_MESSAGE: str = "To help me narrow down your search, tell me about any NOT relevant tags or keywords.\n" \
                           "For example: 'remote', 'BI', 'python', 'machine learning', 'QA'.\n\n" + multi_value_message
THANK_YOU_MESSAGE: str = "Thank you for chatting with JobSeeker Bot!\n\n" \
                         "I can help you find jobs on LinkedIn, Glassdoor, and more."
SEARCH_MESSAGE: str = "To search for jobs on a specific site, simply send the site name:\n" \
                      "/linkedin\n" \
                      "/indeed\n" \
                      "/glassdoor\n" \
                      "/goozali\n\n" \
                      "Or, use the command /find to search across all supported job boards for a broader search.\n\n" \
                      "Let me know how I can assist you further! 😊"
BYE_MESSAGE: str = "Have a great day!✨\n" \
                   "I hope to assist you with your job search in the future.😊"
# %s is filled with the list of values the user just entered (str(list) form).
VERIFY_MESSAGE: str = "Did you choose: %s ? 🧐"

View File

@ -4,9 +4,10 @@ from telegram.ext import (
ContextTypes, ContextTypes,
) )
from db.job_repository import JobRepository from scrapers import Site, scrape_jobs, JobPost
from jobspy import Site, scrape_jobs, JobPost from scrapers.utils import create_logger
from jobspy.scrapers.utils import create_logger from model.job_repository import JobRepository
from model.user_repository import user_repository
from telegram_bot import TelegramBot from telegram_bot import TelegramBot
from telegram_handler.telegram_handler import TelegramHandler from telegram_handler.telegram_handler import TelegramHandler
@ -33,11 +34,8 @@ def map_jobs_to_keyboard(jobs: list[JobPost]) -> InlineKeyboardMarkup:
class TelegramDefaultHandler(TelegramHandler): class TelegramDefaultHandler(TelegramHandler):
def __init__(self, sites: list[Site], locations: list[str], title_filters: list[str], search_term: str): def __init__(self, sites: list[Site]):
self.sites_to_scrap = sites self.sites_to_scrap = sites
self.locations = locations
self.search_term = search_term
self.title_filters = title_filters
self.telegram_bot = TelegramBot() self.telegram_bot = TelegramBot()
self.jobRepository = JobRepository() self.jobRepository = JobRepository()
if len(sites) == 1: if len(sites) == 1:
@ -51,17 +49,21 @@ class TelegramDefaultHandler(TelegramHandler):
chat_id = update.message.chat.id chat_id = update.message.chat.id
await self.telegram_bot.set_message_reaction(chat_id, await self.telegram_bot.set_message_reaction(chat_id,
update.message.message_id, ReactionEmoji.FIRE) update.message.message_id, ReactionEmoji.FIRE)
user = user_repository.find_by_username(update.message.from_user.username)
site_names = [site.name for site in self.sites_to_scrap] site_names = [site.name for site in self.sites_to_scrap]
site_names_print = ", ".join(site_names) site_names_print = ", ".join(site_names)
# locations = [location + ", Israel" for location in user.cities]
await self.telegram_bot.send_text(chat_id, await self.telegram_bot.send_text(chat_id,
f"Start scarping: {site_names_print}") f"Start scarping: {site_names_print}")
filtered_out_jobs, jobs = scrape_jobs( filtered_out_jobs, jobs = scrape_jobs(
site_name=self.sites_to_scrap, site_name=self.sites_to_scrap,
search_term=self.search_term, user=user,
locations=self.locations, search_term=user.position.value,
locations=user.cities,
results_wanted=200, results_wanted=200,
hours_old=48, hours_old=int(user.job_age),
filter_by_title=self.title_filters, filter_by_title=user.title_filters,
country_indeed='israel' country_indeed='israel'
) )
self.logger.info(f"Found {len(jobs)} jobs") self.logger.info(f"Found {len(jobs)} jobs")

View File

@ -0,0 +1,29 @@
from telegram import Update
from telegram.constants import ReactionEmoji
from telegram.ext import (
ContextTypes,
)
from scrapers.utils import create_logger
from model.user_repository import user_repository
from telegram_bot import TelegramBot
from telegram_handler.telegram_handler import TelegramHandler
class MyInfoTelegramHandler(TelegramHandler):
    """Telegram handler that replies with the requesting user's stored profile."""

    def __init__(self):
        self.telegram_bot = TelegramBot()
        self._logger = create_logger("MyInfoTelegramHandler")

    async def handle(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
        """Acknowledge the message, look the sender up by username and send
        their profile summary back to the same chat."""
        self._logger.info("start handling")
        message = update.message
        chat_id = message.chat.id
        await self.telegram_bot.set_message_reaction(
            chat_id, message.message_id, ReactionEmoji.FIRE)
        # NOTE(review): assumes the user already exists; a None return from
        # find_by_username would raise here — confirm registration always
        # precedes /my_info.
        user = user_repository.find_by_username(message.from_user.username)
        await self.telegram_bot.send_text(chat_id, user.get_myinfo_message())
        self._logger.info("finished handling")


# Module-level singleton used by the bot wiring.
my_info_handler = MyInfoTelegramHandler()

View File

@ -0,0 +1,216 @@
from enum import Enum
from telegram import Update, Chat, KeyboardButton, ReplyKeyboardMarkup, ReplyKeyboardRemove
from telegram.constants import ReactionEmoji
from telegram.ext import (
ContextTypes, ConversationHandler, CommandHandler, MessageHandler, filters,
)
from config.cache_manager import cache_manager
from model.Position import Position
from model.User import User
from model.user_repository import user_repository
from scrapers.utils import create_logger
from telegram_bot import TelegramBot
from telegram_handler.start_handler_constats import START_MESSAGE, POSITION_MESSAGE, POSITION_NOT_FOUND, \
LOCATION_MESSAGE, EXPERIENCE_MESSAGE, FILTER_TILE_MESSAGE, THANK_YOU_MESSAGE, BYE_MESSAGE, VERIFY_MESSAGE, \
SEARCH_MESSAGE, EXPERIENCE_INVALID, JOB_AGE_INVALID, JOB_AGE_MESSAGE
class Flow(Enum):
    """Conversation states for the /start flow.

    The integer values are the state ids each step handler returns to
    advance the ConversationHandler.
    """
    POSITION = 0
    ADDRESS = 1
    FILTERS = 2
    EXPERIENCE = 3
    VERIFY_ADDRESS = 4
    VERIFY_FILTERS = 5
    SKIP_FILTERS = 6  # declared but not registered in the states mapping
    JOB_AGE = 7
class TelegramStartHandler:
    """Implements the /start conversation that collects a user's search profile.

    Each step (position -> cities -> experience -> job age -> title filters)
    stores the partial answers in the per-username cache; the finished profile
    is persisted through ``user_repository``.
    """

    def __init__(self):
        self.telegram_bot = TelegramBot()
        self.logger = create_logger("TelegramStartHandler")

    @staticmethod
    def _position_keyboard() -> ReplyKeyboardMarkup:
        """One-time keyboard offering every selectable Position value."""
        buttons = [[KeyboardButton(position.value)] for position in Position]
        return ReplyKeyboardMarkup(buttons, one_time_keyboard=True,
                                   input_field_placeholder=Flow.POSITION.name)

    @staticmethod
    def _yes_no_keyboard(placeholder: str) -> ReplyKeyboardMarkup:
        """One-time Yes/No confirmation keyboard."""
        return ReplyKeyboardMarkup([[KeyboardButton("Yes"), KeyboardButton("No")]],
                                   one_time_keyboard=True,
                                   input_field_placeholder=placeholder)

    @staticmethod
    def _cached_user(username: str) -> User:
        """Fetch the in-progress user from the cache, falling back to the
        repository (the LRU cache may have evicted the entry mid-conversation)."""
        cached_user = cache_manager.find(username)
        if cached_user is None:
            cached_user = user_repository.find_by_username(username)
        return cached_user

    async def start(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Starts the conversation and asks the user about their position."""
        chat: Chat = update.message.chat
        user = user_repository.find_by_username(chat.username)
        if not user:
            # First contact: register the user before starting the flow.
            user = User(full_name=chat.full_name, username=chat.username, chat_id=chat.id)
            user_repository.insert_user(user)
        await update.message.reply_text(START_MESSAGE)
        await update.message.reply_text(
            POSITION_MESSAGE,
            reply_markup=self._position_keyboard(),
        )
        return Flow.POSITION.value

    async def position(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Stores the selected position and asks for locations."""
        user = update.message.from_user
        self.logger.info("Position of %s: %s", user.first_name, update.message.text)
        position = next((p for p in Position if p.value == update.message.text), None)
        if not position:
            # Unknown choice: show the keyboard again and stay in this state.
            await update.message.set_reaction(ReactionEmoji.PILE_OF_POO)
            await update.message.reply_text(POSITION_NOT_FOUND)
            await update.message.reply_text(
                POSITION_MESSAGE,
                reply_markup=self._position_keyboard(),
            )
            return Flow.POSITION.value

        await update.message.set_reaction(ReactionEmoji.FIRE)
        cached_user: User = self._cached_user(user.username)
        cached_user.position = position
        cache_manager.save(cached_user.username, cached_user)
        await update.message.reply_text(LOCATION_MESSAGE)
        return Flow.ADDRESS.value

    async def address(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Stores the comma-separated city list and asks for confirmation."""
        # Remove leading/trailing spaces from each city name.
        cities = [city.strip() for city in update.message.text.split(",")]
        await update.message.set_reaction(ReactionEmoji.FIRE)
        await update.message.reply_text(
            VERIFY_MESSAGE % cities,
            reply_markup=self._yes_no_keyboard(Flow.VERIFY_ADDRESS.name))
        cached_user: User = self._cached_user(update.message.from_user.username)
        cached_user.cities = cities
        cache_manager.save(cached_user.username, cached_user)
        return Flow.VERIFY_ADDRESS.value

    async def verify_address(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Confirms the address list; 'No' loops back to the address step."""
        if update.message.text == "No":
            await update.message.set_reaction(ReactionEmoji.PILE_OF_POO)
            await update.message.reply_text(LOCATION_MESSAGE)
            return Flow.ADDRESS.value
        await update.message.set_reaction(ReactionEmoji.FIRE)
        await update.message.reply_text(EXPERIENCE_MESSAGE)
        return Flow.EXPERIENCE.value

    async def experience(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Stores the years of experience (must be numeric)."""
        user = update.message.from_user
        self.logger.info("Experience of %s: %s", user.first_name, update.message.text)
        if not update.message.text.isnumeric():
            await update.message.set_reaction(ReactionEmoji.PILE_OF_POO)
            await update.message.reply_text(EXPERIENCE_INVALID)
            await update.message.reply_text(EXPERIENCE_MESSAGE)
            return Flow.EXPERIENCE.value
        await update.message.set_reaction(ReactionEmoji.FIRE)
        cached_user: User = self._cached_user(user.username)
        cached_user.experience = update.message.text
        cache_manager.save(cached_user.username, cached_user)
        await update.message.reply_text(JOB_AGE_MESSAGE)
        return Flow.JOB_AGE.value

    async def job_age(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Stores the maximum job age in hours (must be numeric)."""
        user = update.message.from_user
        self.logger.info("Job age of %s: %s", user.first_name, update.message.text)
        # Bug fix: validate BEFORE reacting, so invalid input gets only the
        # POO reaction (the original reacted FIRE first and then POO on the
        # same message), matching the `experience` step.
        if not update.message.text.isnumeric():
            await update.message.set_reaction(ReactionEmoji.PILE_OF_POO)
            await update.message.reply_text(JOB_AGE_INVALID)
            await update.message.reply_text(JOB_AGE_MESSAGE)
            return Flow.JOB_AGE.value
        await update.message.set_reaction(ReactionEmoji.FIRE)
        cached_user: User = self._cached_user(user.username)
        cached_user.job_age = update.message.text
        cache_manager.save(cached_user.username, cached_user)
        await update.message.reply_text(FILTER_TILE_MESSAGE)
        return Flow.FILTERS.value

    async def filters_flow(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Stores the comma-separated title filters and asks for confirmation."""
        await update.message.set_reaction(ReactionEmoji.FIRE)
        # Remove leading/trailing spaces from each filter.
        title_filters = [title_filter.strip() for title_filter in update.message.text.split(",")]
        await update.message.reply_text(
            VERIFY_MESSAGE % title_filters,
            reply_markup=self._yes_no_keyboard(Flow.VERIFY_FILTERS.name))
        cached_user: User = self._cached_user(update.message.from_user.username)
        cached_user.title_filters = title_filters
        cache_manager.save(cached_user.username, cached_user)
        return Flow.VERIFY_FILTERS.value

    async def verify_filter(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Confirms the filters, persists the profile and ends the conversation."""
        if update.message.text == "No":
            await update.message.set_reaction(ReactionEmoji.PILE_OF_POO)
            await update.message.reply_text(FILTER_TILE_MESSAGE)
            return Flow.FILTERS.value
        await update.message.set_reaction(ReactionEmoji.FIRE)
        await update.message.reply_text(THANK_YOU_MESSAGE)
        await update.message.reply_text(SEARCH_MESSAGE)
        cached_user: User = self._cached_user(update.message.from_user.username)
        user_repository.update(cached_user)
        return ConversationHandler.END

    async def skip_filter(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Skips the filter step and ends the conversation without persisting."""
        await update.message.set_reaction(ReactionEmoji.FIRE)
        user = update.message.from_user
        self.logger.info("User %s did not send a filters.", user.first_name)
        await update.message.reply_text(THANK_YOU_MESSAGE)
        await update.message.reply_text(SEARCH_MESSAGE)
        return ConversationHandler.END

    async def cancel(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> int:
        """Cancels and ends the conversation, saving whatever was collected."""
        await update.message.set_reaction(ReactionEmoji.FIRE)
        user = update.message.from_user
        self.logger.info("User %s canceled the conversation.", user.first_name)
        await update.message.reply_text(
            BYE_MESSAGE, reply_markup=ReplyKeyboardRemove()
        )
        # Bug fix: the original called user_repository.update(username, user),
        # but update takes the user object alone (see verify_filter). Also
        # guard against /cancel arriving before anything was cached.
        cached_user: User = cache_manager.find(user.username)
        if cached_user:
            user_repository.update(cached_user)
        return ConversationHandler.END
# Module-level singleton wired into the Telegram application.
start_handler = TelegramStartHandler()

# Conversation wiring for /start: each Flow state maps to the handler method
# that consumes the user's next text message; /cancel aborts at any point.
# NOTE(review): skip_filter / Flow.SKIP_FILTERS exist on the handler but are
# not registered here — confirm whether a skip fallback was intended.
start_conv_handler = ConversationHandler(
    entry_points=[CommandHandler("start", start_handler.start)],
    states={
        Flow.POSITION.value: [MessageHandler(filters.TEXT, start_handler.position)],
        Flow.ADDRESS.value: [MessageHandler(filters.TEXT, start_handler.address)],
        Flow.VERIFY_ADDRESS.value: [MessageHandler(filters.TEXT, start_handler.verify_address)],
        Flow.EXPERIENCE.value: [MessageHandler(filters.TEXT, start_handler.experience)],
        Flow.JOB_AGE.value: [MessageHandler(filters.TEXT, start_handler.job_age)],
        Flow.FILTERS.value: [MessageHandler(filters.TEXT, start_handler.filters_flow)],
        Flow.VERIFY_FILTERS.value: [MessageHandler(filters.TEXT, start_handler.verify_filter)],
    },
    fallbacks=[CommandHandler("cancel", start_handler.cancel)],
)

View File

@ -1,4 +1,4 @@
from jobspy import scrape_jobs from scrapers import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,6 +1,6 @@
from dotenv import load_dotenv from dotenv import load_dotenv
from db.job_repository import JobRepository from model.job_repository import JobRepository
from tests.test_util import createMockJob from tests.test_util import createMockJob
load_dotenv() load_dotenv()

View File

@ -1,4 +1,4 @@
from jobspy import scrape_jobs from scrapers import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,4 +1,4 @@
from jobspy import scrape_jobs from scrapers import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,12 +1,12 @@
import json import json
import os import os
from jobspy.jobs import JobPost from jobs import JobPost
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper from scrapers.goozali.GoozaliMapper import GoozaliMapper
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent from scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column from scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column
from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData from scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData
from jobspy.scrapers.utils import create_dict_by_key_and_value from scrapers.utils import create_dict_by_key_and_value
# URL Example # URL Example
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D # 
https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D

View File

@ -1,4 +1,4 @@
from jobspy import scrape_jobs from scrapers import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,4 +1,4 @@
from jobspy import scrape_jobs from scrapers import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,7 +1,7 @@
from datetime import datetime, date from datetime import datetime, date
from typing import List from typing import List
from jobspy import JobPost, Location, Country from scrapers import JobPost, Location, Country
# Creating some test job posts # Creating some test job posts

View File

@ -1,4 +1,4 @@
from jobspy import scrape_jobs from scrapers import scrape_jobs
import pandas as pd import pandas as pd