Add complete job search functionality with proxy support and security improvements

pull/240/head
Daniel 2025-02-04 17:05:23 -07:00
parent 333e9e6760
commit 144ede9e59
7 changed files with 717 additions and 10 deletions

47
.gitignore vendored

@@ -1,10 +1,37 @@
/venv/
/.idea
**/__pycache__/
**/.pytest_cache/
/.ipynb_checkpoints/
**/output/
**/.DS_Store
*.pyc
.env
dist
# Sensitive configuration
config_sensitive.py
# Generated files
*.csv
*.log
# Python
__pycache__/
*.py[cod]
*$py.class
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo

23
config.py Normal file

@@ -0,0 +1,23 @@
# Default configuration that can be committed
DEFAULT_CONFIG = {
    'search_term': 'IT Engineer',
    'location': 'Lone Tree, CO',
    'distance': 25,
    'results_wanted': 50,
    'job_type': 'fulltime',
    'hours_old': 72,
    'search_sites': ["indeed", "linkedin"],
    'exclude_clearance': True,
    'clearance_keywords': [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
}

try:
    # Try to import sensitive config from a local file kept out of version control
    from config_sensitive import SENSITIVE_CONFIG
except ImportError:
    SENSITIVE_CONFIG = {}
    print("Warning: No sensitive configuration found. Using defaults.")

49
config_sensitive_template.py Normal file

@@ -0,0 +1,49 @@
"""
JobSpy Sensitive Configuration Template
=====================================
Setup Instructions:
1. Copy this file to 'config_sensitive.py'
2. Fill in your actual values
3. Keep config_sensitive.py in .gitignore
Security Best Practices:
- Never commit config_sensitive.py to version control
- Store proxy credentials securely
- Rotate credentials regularly
- Use environment variables when possible
"""
SENSITIVE_CONFIG = {
'proxy_enabled': True, # Set to False to disable proxy usage
# Add your proxy URLs here (at least one required if proxy_enabled is True)
'proxy_list': [
"http://your-username:your-password@your-proxy-host:port",
"http://your-backup-proxy-url:port" # Optional backup proxy
],
# IP verification services (can be customized)
'proxy_verification_urls': [
'http://api.ipify.org?format=json',
'http://ip-api.com/json',
'http://ifconfig.me/ip'
],
# Advanced Settings
'proxy_timeout': 10, # Seconds to wait for proxy response
'max_retries': 3, # Maximum retry attempts per proxy
'rotate_interval': 100, # Rotate proxy after N requests
'verify_ssl': False # Disable for some proxy configurations
}
"""
Example format for proxy_list entries:
- Bright Data format: "http://brd-customer-[username]-zone-[zone_name]:[password]@brd.superproxy.io:22225"
- Generic format: "http://username:password@host:port"
Security Notes:
1. Never commit config_sensitive.py to version control
2. Keep your proxy credentials secure
3. Regularly rotate proxy credentials if possible
"""

109
job_search.py Normal file

@@ -0,0 +1,109 @@
from jobspy import scrape_jobs
import re
import pandas as pd
from datetime import datetime
from typing import Optional, List


def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out jobs requiring security clearance"""
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
    # Create a pattern matching any clearance keyword; escape each keyword so
    # characters like '+' and '/' are treated literally, not as regex syntax
    pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)
    # Filter out jobs where the title or description contains a clearance keyword
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )
    return df[mask]


def search_tech_jobs(
    search_sites: List[str] = ["indeed", "linkedin"],
    exclude_clearance: bool = False
) -> Optional[pd.DataFrame]:
    # Search configuration
    search_config = {
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72
    }
    try:
        print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
        print(f"Distance: {search_config['distance']} miles")
        print(f"Job Type: {search_config['job_type']}")
        print(f"Posts from last: {search_config['hours_old']} hours")
        print(f"Excluding clearance jobs: {exclude_clearance}")
        print(f"Searching on: {', '.join(search_sites)}")

        jobs = scrape_jobs(
            site_name=search_sites,
            search_term=search_config['search_term'],
            location=search_config['location'],
            distance=search_config['distance'],
            results_wanted=search_config['results_wanted'],
            job_type=search_config['job_type'],
            hours_old=search_config['hours_old'],
            country_indeed="USA",
            description_format="markdown",
            verbose=2
        )

        if isinstance(jobs, pd.DataFrame) and not jobs.empty:
            print(f"\nInitial jobs found: {len(jobs)}")

            if exclude_clearance:
                original_count = len(jobs)
                jobs = filter_clearance_jobs(jobs)
                filtered_count = len(jobs)
                print(f"Removed {original_count - filtered_count} jobs requiring clearance")

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            jobs.to_csv(csv_filename, index=False)
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        print("No jobs found with current search parameters.")
        return None
    except Exception as e:
        print("\nError during search:")
        print(f"Error details: {str(e)}")
        return None


if __name__ == "__main__":
    print("Starting job search...")
    jobs = search_tech_jobs(exclude_clearance=True)

    if jobs is not None and not jobs.empty:
        print("\nSearch completed successfully!")
        print(f"Total jobs found: {len(jobs)}")
        print("\nJobs by source:")
        print(jobs['site'].value_counts())
    else:
        print("\nNo results found. Try adjusting search parameters.")

365
job_search_advanced.py Normal file

@@ -0,0 +1,365 @@
import csv
import re
from jobspy import scrape_jobs
from datetime import datetime
import certifi
import time
from typing import Optional, Dict, Any
import pandas as pd
import requests
import sys
from requests import Session


def fix_linkedin_url(url: str) -> str:
    """Fix incomplete LinkedIn URLs."""
    if not url or 'linkedin' not in url:
        return url
    # If the URL is truncated, try to reconstruct it
    if url.startswith('https://www.linkedin') and '/jobs/view/' not in url:
        # Extract the job ID if present
        job_id = url.split('/')[-1] if url.split('/')[-1].isdigit() else None
        if job_id:
            return f"https://www.linkedin.com/jobs/view/{job_id}"
    return url


def clean_job_data(jobs_df: pd.DataFrame) -> pd.DataFrame:
    """Clean and validate job data."""
    # Fix LinkedIn URLs
    jobs_df['job_url'] = jobs_df.apply(
        lambda row: fix_linkedin_url(row['job_url']) if row['site'] == 'linkedin' else row['job_url'],
        axis=1
    )
    # Remove rows with missing essential data
    essential_columns = ['title', 'company', 'location', 'job_url']
    jobs_df = jobs_df.dropna(subset=essential_columns)
    # Clean up location data
    jobs_df['location'] = jobs_df['location'].fillna('Location not specified')
    # Ensure a description exists
    jobs_df['description'] = jobs_df['description'].fillna('No description available')
    return jobs_df


def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out jobs requiring security clearance"""
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
    # Create a pattern matching any clearance keyword; escape each keyword so
    # characters like '+' and '/' are treated literally, not as regex syntax
    pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)
    # Filter out jobs where the title or description contains a clearance keyword
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )
    return df[mask]


def verify_proxy(proxy: str) -> bool:
    """Enhanced proxy verification"""
    try:
        # Check multiple IP verification services
        verification_urls = [
            'http://api.ipify.org?format=json',
            'http://ip-api.com/json',
            'http://ifconfig.me/ip'
        ]

        # First check the real IP (only the first 3 digits are printed, for security)
        real_ips = []
        for url in verification_urls:
            try:
                response = requests.get(url, timeout=5)
                if response.ok:
                    ip = response.text if 'ifconfig' in url else response.json().get('ip', response.text)
                    real_ips.append(ip)
                    break
            except Exception:
                continue

        if not real_ips:
            print("Could not verify real IP")
            return False
        real_ip = real_ips[0]

        # Check again through the proxy
        proxies = {
            'http': proxy,
            'https': proxy
        }
        # Configure the session to tolerate SSL issues introduced by the proxy
        session = requests.Session()
        session.verify = False
        requests.packages.urllib3.disable_warnings()

        proxy_ips = []
        for url in verification_urls:
            try:
                response = session.get(url, proxies=proxies, timeout=10)
                if response.ok:
                    ip = response.text if 'ifconfig' in url else response.json().get('ip', response.text)
                    proxy_ips.append(ip)
                    break
            except Exception:
                continue

        if not proxy_ips:
            print("Could not verify proxy IP")
            return False
        proxy_ip = proxy_ips[0]

        if real_ip != proxy_ip:
            print("\nProxy verification successful!")
            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
            print(f"Proxy IP: {proxy_ip}")
            print(f"IP Verification Service: {url}")
            return True
        else:
            print("\nWarning: Proxy not working - IPs match!")
            return False
    except Exception as e:
        print(f"\nProxy verification failed: {str(e)}")
        return False


def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
    """Verify proxy usage and return traffic stats"""
    response = session.get(url, stream=True)
    content_size = len(response.content)
    return {
        "status_code": response.status_code,
        "content_size": content_size,
        "headers": dict(response.headers),
        "proxy_used": bool(session.proxies)
    }


def search_tech_jobs_with_proxies() -> Optional[pd.DataFrame]:
    # Comprehensive search configuration
    search_config = {
        # Search parameters
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72,
        # Filter settings
        'exclude_clearance': True,
        'search_sites': ["indeed", "linkedin"],
        # Proxy settings -- keep placeholders here; real credentials belong in
        # config_sensitive.py, never in committed code
        'use_proxy': True,  # Proxy kill switch
        'proxy_list': [
            "http://your-username:your-password@your-proxy-host:port",
            "http://your-backup-proxy-url:port"  # Optional backup proxy
        ],
        # Clearance keywords to filter
        'clearance_keywords': [
            'clearance', 'security clearance', 'secret', 'top secret',
            'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
            'public trust', 'security+', 'security plus'
        ],
        # Additional settings for better results
        'max_retries_per_proxy': 2,  # Number of retries per proxy
        'verify_timeout': 15,        # Timeout for proxy verification
        'date_format': '%Y-%m-%d',   # Standardize date format
        'strict_location': True,     # Enforce stricter location filtering
        # Location verification
        'location_center': {
            'lat': 39.5486,   # Lone Tree coordinates
            'lon': -104.8719
        },
        'max_distance': 25,  # miles
        # Debug settings
        'show_filtered_jobs': False,  # Option to show filtered out jobs
        'debug_mode': False,          # Additional debugging information
        'debug': {
            'show_traffic': True,
            'log_requests': True,
            'show_proxy_usage': True
        }
    }

    max_retries = 3
    retry_count = 0

    # Proxy verification and kill switch
    if search_config['use_proxy']:
        print("\nVerifying proxy configuration...")
        proxy_verified = False
        for proxy in search_config['proxy_list']:
            if verify_proxy(proxy):
                proxy_verified = True
                break
        if not proxy_verified:
            print("\nNo working proxies found! Exiting for safety...")
            sys.exit(1)
    else:
        print("\nWARNING: Running without proxy! This may result in IP blocking.")
        user_input = input("Continue without proxy? (yes/no): ")
        if user_input.lower() != 'yes':
            print("Exiting...")
            sys.exit(0)

    while retry_count < max_retries:
        current_proxy = (
            search_config['proxy_list'][retry_count % len(search_config['proxy_list'])]
            if search_config['use_proxy'] else None
        )
        try:
            print(f"\nAttempt {retry_count + 1} of {max_retries}")
            if current_proxy:
                print(f"Using proxy: {current_proxy}")
            print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
            print(f"Distance: {search_config['distance']} miles")
            print(f"Job Type: {search_config['job_type']}")
            print(f"Posts from last: {search_config['hours_old']} hours")
            print(f"Excluding clearance jobs: {search_config['exclude_clearance']}")
            print(f"Searching on: {', '.join(search_config['search_sites'])}")

            jobs = scrape_jobs(
                site_name=search_config['search_sites'],
                search_term=search_config['search_term'],
                location=search_config['location'],
                distance=search_config['distance'],
                results_wanted=search_config['results_wanted'],
                job_type=search_config['job_type'],
                hours_old=search_config['hours_old'],
                country_indeed="USA",
                description_format="markdown",
                verbose=2,
                proxy=current_proxy,
                verify=False if current_proxy else certifi.where(),  # Disable SSL verification when routing through a proxy
            )

            if not isinstance(jobs, pd.DataFrame):
                print("Invalid response format from job search.")
                retry_count += 1
                continue
            if jobs.empty:
                print("No jobs found with current search parameters.")
                retry_count += 1
                continue

            print(f"\nInitial jobs found: {len(jobs)}")

            # Track filtered jobs
            filtered_jobs = {
                'clearance': 0,
                'location': 0,
                'date': 0
            }

            if search_config['exclude_clearance']:
                original_count = len(jobs)
                # Escape keywords so '+' and '/' are matched literally
                pattern = '|'.join(re.escape(k) for k in search_config['clearance_keywords'])
                clearance_mask = ~(
                    jobs['title'].str.lower().str.contains(pattern, na=False) |
                    jobs['description'].str.lower().str.contains(pattern, na=False)
                )
                filtered_jobs['clearance'] = original_count - len(jobs[clearance_mask])
                jobs = jobs[clearance_mask]

            # Fix date formatting
            jobs['date_posted'] = pd.to_datetime(jobs['date_posted'], errors='coerce')
            date_mask = jobs['date_posted'].notna()
            filtered_jobs['date'] = len(jobs) - len(jobs[date_mask])
            jobs = jobs[date_mask]

            # Location filtering
            if search_config['strict_location']:
                location_mask = jobs['location'].apply(
                    lambda x: is_within_radius(x,
                                               search_config['location_center'],
                                               search_config['max_distance'])
                )
                filtered_jobs['location'] = len(jobs) - len(jobs[location_mask])
                jobs = jobs[location_mask]

            # Print filtering summary
            print("\nFiltering Summary:")
            for reason, count in filtered_jobs.items():
                if count > 0:
                    print(f"Removed {count} jobs due to {reason}")

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            # Save to CSV
            jobs.to_csv(
                csv_filename,
                quoting=csv.QUOTE_NONNUMERIC,
                escapechar="\\",
                index=False
            )
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        except Exception as e:
            print(f"\nError with proxy {current_proxy}:")
            print(f"Error details: {str(e)}")
            retry_count += 1
            if retry_count < max_retries:
                wait_time = 5 * retry_count
                print(f"\nWaiting {wait_time} seconds before trying next proxy...")
                time.sleep(wait_time)
            else:
                print("\nAll attempts failed. Please try again later.")

    return None

def calculate_distance(job_location, search_location):
    """
    Placeholder for distance calculation.
    In a full implementation, this would use geocoding and actual distance calculation.
    """
    return "Unknown"  # Would need geocoding API to calculate actual distances


def is_within_radius(job_location: str, center: dict, max_distance: int) -> bool:
    """Verify if job location is within specified radius"""
    try:
        # Add geocoding logic here if needed
        return True  # Placeholder for now
    except Exception:
        return False


if __name__ == "__main__":
    print("Starting job search...")
    jobs = search_tech_jobs_with_proxies()

    if jobs is not None and not jobs.empty:
        print("\nSearch completed successfully!")
        print(f"Total jobs found: {len(jobs)}")
        print("\nJobs by source:")
        print(jobs['site'].value_counts())
    else:
        print("\nNo results found. Try adjusting search parameters.")

88
proxy_utils.py Normal file

@@ -0,0 +1,88 @@
from typing import Dict, Any, Optional
from requests import Session, Response
import requests
import urllib3

# Suppress SSL warnings triggered by proxy connections with verification disabled
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def verify_proxy(proxy: str, verification_urls: list) -> bool:
    """Verify the proxy is working and hiding the real IP"""
    try:
        # First check the real IP
        real_ip = get_real_ip(verification_urls)
        if not real_ip:
            print("Could not verify real IP")
            return False

        proxy_ip = get_proxy_ip(proxy, verification_urls)
        if not proxy_ip:
            print("Could not verify proxy IP")
            return False

        if real_ip != proxy_ip:
            print("\nProxy verification successful!")
            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
            print(f"Proxy IP: {proxy_ip}")
            return True
        else:
            print("\nWarning: Proxy not working - IPs match!")
            return False
    except Exception as e:
        print(f"\nProxy verification failed: {str(e)}")
        return False


def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
    """Verify proxy usage and return traffic stats"""
    try:
        response = session.get(url, stream=True)
        content_size = len(response.content)
        return {
            "status_code": response.status_code,
            "content_size": content_size,
            "headers": dict(response.headers),
            "proxy_used": bool(session.proxies)
        }
    except Exception as e:
        print(f"Error tracking proxy usage: {str(e)}")
        return {
            "status_code": 0,
            "content_size": 0,
            "headers": {},
            "proxy_used": False
        }


def get_real_ip(verification_urls: list) -> Optional[str]:
    """Get the real IP address without a proxy"""
    for url in verification_urls:
        try:
            response = requests.get(url, timeout=5)
            if response.ok:
                return extract_ip(response, url)
        except Exception:
            continue
    return None


def get_proxy_ip(proxy: str, verification_urls: list) -> Optional[str]:
    """Get the IP address seen when using the proxy"""
    proxies = {'http': proxy, 'https': proxy}
    session = requests.Session()
    session.verify = False
    for url in verification_urls:
        try:
            response = session.get(url, proxies=proxies, timeout=10)
            if response.ok:
                return extract_ip(response, url)
        except Exception:
            continue
    return None


def extract_ip(response: Response, url: str) -> str:
    """Extract the IP from the response based on the service used"""
    if 'ifconfig.me' in url:
        return response.text
    return response.json().get('ip', response.text)
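A short usage sketch for these helpers, reusing the verification URLs from the sensitive-config template (the proxy URL below is a placeholder, not a working endpoint):

# Illustrative usage of proxy_utils; the proxy URL is a placeholder.
import requests
from proxy_utils import verify_proxy, verify_proxy_usage

verification_urls = [
    'http://api.ipify.org?format=json',
    'http://ip-api.com/json',
    'http://ifconfig.me/ip',
]
proxy = "http://your-username:your-password@your-proxy-host:port"

if verify_proxy(proxy, verification_urls):
    session = requests.Session()
    session.proxies = {'http': proxy, 'https': proxy}
    session.verify = False
    stats = verify_proxy_usage(session, 'http://api.ipify.org?format=json')
    print(stats['status_code'], stats['content_size'], stats['proxy_used'])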

46
setup_config.py Normal file

@@ -0,0 +1,46 @@
"""
Helper script to set up configuration files
"""
import os
import shutil
from getpass import getpass
def setup_config():
# Check if config_sensitive.py already exists
if os.path.exists('config_sensitive.py'):
overwrite = input("config_sensitive.py already exists. Overwrite? (yes/no): ")
if overwrite.lower() != 'yes':
print("Setup cancelled.")
return
# Copy template
shutil.copy2('config_sensitive_template.py', 'config_sensitive.py')
# Get proxy configuration
use_proxy = input("Do you want to use proxies? (yes/no): ").lower() == 'yes'
if use_proxy:
proxy_url = input("Enter proxy URL (format: http://host:port): ")
username = input("Proxy username: ")
password = getpass("Proxy password: ")
# Create proxy string
proxy = f"http://{username}:{password}@{proxy_url.split('//')[1]}"
# Update config file
with open('config_sensitive.py', 'r') as f:
content = f.read()
content = content.replace(
'"http://your-username:your-password@your-proxy-host:port"',
f'"{proxy}"'
)
with open('config_sensitive.py', 'w') as f:
f.write(content)
print("\nConfiguration file created successfully!")
print("Remember to add config_sensitive.py to .gitignore")
if __name__ == "__main__":
setup_config()