mirror of https://github.com/Bunsly/JobSpy
Add complete job search functionality with proxy support and security improvements
parent 333e9e6760
commit 144ede9e59

@@ -1,10 +1,37 @@
-/venv/
-/.idea
-**/__pycache__/
-**/.pytest_cache/
-/.ipynb_checkpoints/
-**/output/
-**/.DS_Store
-*.pyc
-.env
-dist
+# Sensitive configuration
+config_sensitive.py
+
+# Generated files
+*.csv
+*.log
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo

@@ -0,0 +1,23 @@
# Default configuration that can be committed
DEFAULT_CONFIG = {
    'search_term': 'IT Engineer',
    'location': 'Lone Tree, CO',
    'distance': 25,
    'results_wanted': 50,
    'job_type': 'fulltime',
    'hours_old': 72,
    'search_sites': ["indeed", "linkedin"],
    'exclude_clearance': True,
    'clearance_keywords': [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
}


try:
    # Try to import sensitive config from a local file
    from .config_sensitive import SENSITIVE_CONFIG
except ImportError:
    print("Warning: No sensitive configuration found. Using defaults.")
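
The module imports SENSITIVE_CONFIG but never shows how it is combined with DEFAULT_CONFIG. A minimal sketch of one possible merge step (the get_config helper and the empty-dict fallback are assumptions, not part of this commit):

# Assumed fallback so the merge below works even when no local config file exists.
try:
    from .config_sensitive import SENSITIVE_CONFIG
except ImportError:
    SENSITIVE_CONFIG = {}

def get_config() -> dict:
    """Return DEFAULT_CONFIG with any sensitive overrides applied on top."""
    merged = dict(DEFAULT_CONFIG)    # copy so the defaults stay untouched
    merged.update(SENSITIVE_CONFIG)  # sensitive values win on key collisions
    return merged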

@@ -0,0 +1,49 @@
"""
JobSpy Sensitive Configuration Template
=======================================

Setup Instructions:
1. Copy this file to 'config_sensitive.py'
2. Fill in your actual values
3. Keep config_sensitive.py in .gitignore

Security Best Practices:
- Never commit config_sensitive.py to version control
- Store proxy credentials securely
- Rotate credentials regularly
- Use environment variables when possible
"""

SENSITIVE_CONFIG = {
    'proxy_enabled': True,  # Set to False to disable proxy usage

    # Add your proxy URLs here (at least one required if proxy_enabled is True)
    'proxy_list': [
        "http://your-username:your-password@your-proxy-host:port",
        "http://your-backup-proxy-url:port"  # Optional backup proxy
    ],

    # IP verification services (can be customized)
    'proxy_verification_urls': [
        'http://api.ipify.org?format=json',
        'http://ip-api.com/json',
        'http://ifconfig.me/ip'
    ],

    # Advanced Settings
    'proxy_timeout': 10,     # Seconds to wait for proxy response
    'max_retries': 3,        # Maximum retry attempts per proxy
    'rotate_interval': 100,  # Rotate proxy after N requests
    'verify_ssl': False      # Disable for some proxy configurations
}

"""
Example format for proxy_list entries:
- Bright Data format: "http://brd-customer-[username]-zone-[zone_name]:[password]@brd.superproxy.io:22225"
- Generic format: "http://username:password@host:port"

Security Notes:
1. Never commit config_sensitive.py to version control
2. Keep your proxy credentials secure
3. Regularly rotate proxy credentials if possible
"""
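
The template recommends environment variables where possible. A short sketch of what that could look like (the JOBSPY_PROXY_* variable names are assumptions used only for illustration):

import os

# Hypothetical: build the proxy URL from environment variables instead of
# hard-coding credentials in config_sensitive.py.
proxy_user = os.environ.get("JOBSPY_PROXY_USER")  # assumed variable name
proxy_pass = os.environ.get("JOBSPY_PROXY_PASS")  # assumed variable name
proxy_host = os.environ.get("JOBSPY_PROXY_HOST")  # assumed, e.g. "host:port"

if proxy_user and proxy_pass and proxy_host:
    SENSITIVE_CONFIG["proxy_list"] = [
        f"http://{proxy_user}:{proxy_pass}@{proxy_host}"
    ]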

@@ -0,0 +1,109 @@
from jobspy import scrape_jobs
import time
import certifi
import pandas as pd
import csv
from datetime import datetime
from typing import Optional, List


def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out jobs requiring security clearance"""
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]

    # Create a pattern matching any clearance keyword
    pattern = '|'.join(clearance_keywords)

    # Filter out jobs where title or description contains clearance keywords
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )

    return df[mask]


def search_tech_jobs(
    search_sites: List[str] = ["indeed", "linkedin"],
    exclude_clearance: bool = False
) -> Optional[pd.DataFrame]:

    # Search configuration
    search_config = {
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72
    }

    try:
        print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
        print(f"Distance: {search_config['distance']} miles")
        print(f"Job Type: {search_config['job_type']}")
        print(f"Posts from last: {search_config['hours_old']} hours")
        print(f"Excluding clearance jobs: {exclude_clearance}")
        print(f"Searching on: {', '.join(search_sites)}")

        jobs = scrape_jobs(
            site_name=search_sites,
            search_term=search_config['search_term'],
            location=search_config['location'],
            distance=search_config['distance'],
            results_wanted=search_config['results_wanted'],
            job_type=search_config['job_type'],
            hours_old=search_config['hours_old'],
            country_indeed="USA",
            description_format="markdown",
            verbose=2
        )

        if isinstance(jobs, pd.DataFrame) and not jobs.empty:
            print(f"\nInitial jobs found: {len(jobs)}")

            if exclude_clearance:
                original_count = len(jobs)
                jobs = filter_clearance_jobs(jobs)
                filtered_count = len(jobs)
                print(f"Removed {original_count - filtered_count} jobs requiring clearance")

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            jobs.to_csv(csv_filename, index=False)
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        print("No jobs found with current search parameters.")
        return None

    except Exception as e:
        print(f"\nError during search:")
        print(f"Error details: {str(e)}")
        return None


if __name__ == "__main__":
    print("Starting job search...")
    jobs = search_tech_jobs(exclude_clearance=True)

    if jobs is not None and not jobs.empty:
        print("\nSearch completed successfully!")
        print(f"Total jobs found: {len(jobs)}")
        print("\nJobs by source:")
        print(jobs['site'].value_counts())
    else:
        print("\nNo results found. Try adjusting search parameters.")
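
One caveat in filter_clearance_jobs: str.contains treats the joined pattern as a regular expression, and 'security+' contains the metacharacter '+'. It still happens to match 'security', but a safer construction escapes each keyword before joining. A sketch of that adjustment (not part of the commit):

import re

# Escape each keyword so '+' and other regex metacharacters match literally.
pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)
mask = ~(
    df['title'].str.lower().str.contains(pattern, na=False) |
    df['description'].str.lower().str.contains(pattern, na=False)
)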

@@ -0,0 +1,365 @@
import csv
from jobspy import scrape_jobs
from datetime import datetime
import certifi
import time
from typing import Optional, List, Dict, Any, Union
import pandas as pd
import requests
import sys
from requests import Session


def fix_linkedin_url(url: str) -> str:
    """Fix incomplete LinkedIn URLs."""
    if not url or 'linkedin' not in url:
        return url

    # If URL is truncated, try to reconstruct it
    if url.startswith('https://www.linkedin') and '/jobs/view/' not in url:
        # Extract the job ID if present
        job_id = url.split('/')[-1] if url.split('/')[-1].isdigit() else None
        if job_id:
            return f"https://www.linkedin.com/jobs/view/{job_id}"
    return url


def clean_job_data(jobs_df):
    """Clean and validate job data."""
    # Fix LinkedIn URLs
    jobs_df['job_url'] = jobs_df.apply(
        lambda row: fix_linkedin_url(row['job_url']) if row['site'] == 'linkedin' else row['job_url'],
        axis=1
    )

    # Remove rows with missing essential data
    essential_columns = ['title', 'company', 'location', 'job_url']
    jobs_df = jobs_df.dropna(subset=essential_columns)

    # Clean up location data
    jobs_df['location'] = jobs_df['location'].fillna('Location not specified')

    # Ensure description exists
    jobs_df['description'] = jobs_df['description'].fillna('No description available')

    return jobs_df


def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out jobs requiring security clearance"""
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]

    # Create a pattern matching any clearance keyword
    pattern = '|'.join(clearance_keywords)

    # Filter out jobs where title or description contains clearance keywords
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )

    return df[mask]


def verify_proxy(proxy: str) -> bool:
    """Enhanced proxy verification"""
    try:
        # Check multiple IP verification services
        verification_urls = [
            'http://api.ipify.org?format=json',
            'http://ip-api.com/json',
            'http://ifconfig.me/ip'
        ]

        # First check real IP (only first 3 digits for security)
        real_ips = []
        for url in verification_urls:
            try:
                response = requests.get(url, timeout=5)
                if response.ok:
                    ip = response.text if 'ifconfig' in url else response.json().get('ip', response.text)
                    real_ips.append(ip)
                    break
            except:
                continue

        if not real_ips:
            print("Could not verify real IP")
            return False

        real_ip = real_ips[0]

        # Check with proxy
        proxies = {
            'http': proxy,
            'https': proxy
        }

        # Configure session to handle SSL issues
        session = requests.Session()
        session.verify = False
        requests.packages.urllib3.disable_warnings()

        proxy_ips = []
        for url in verification_urls:
            try:
                response = session.get(url, proxies=proxies, timeout=10)
                if response.ok:
                    ip = response.text if 'ifconfig' in url else response.json().get('ip', response.text)
                    proxy_ips.append(ip)
                    break
            except:
                continue

        if not proxy_ips:
            print("Could not verify proxy IP")
            return False

        proxy_ip = proxy_ips[0]

        if real_ip != proxy_ip:
            print(f"\nProxy verification successful!")
            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
            print(f"Proxy IP: {proxy_ip}")
            print(f"IP Verification Service: {url}")
            return True
        else:
            print("\nWarning: Proxy not working - IPs match!")
            return False

    except Exception as e:
        print(f"\nProxy verification failed: {str(e)}")
        return False


def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
    """Verify proxy usage and return traffic stats"""
    start_size = 0
    response = session.get(url, stream=True)
    content_size = len(response.content)

    return {
        "status_code": response.status_code,
        "content_size": content_size,
        "headers": dict(response.headers),
        "proxy_used": bool(session.proxies)
    }


def search_tech_jobs_with_proxies() -> Optional[pd.DataFrame]:
    # Comprehensive search configuration
    search_config = {
        # Search parameters
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72,

        # Filter settings
        'exclude_clearance': True,
        'search_sites': ["indeed", "linkedin"],

        # Proxy settings
        'use_proxy': True,  # Proxy kill switch
        'proxy_list': [
            "http://brd-customer-hl_92b00ed6-zone-residential_proxies_us:5t01plrkfs6y@brd.superproxy.io:33335",
            "http://brd-customer-hl_92b00ed6-zone-residential_proxy2_us:uyfjctxhc8t4@brd.superproxy.io:33335"
        ],

        # Clearance keywords to filter
        'clearance_keywords': [
            'clearance', 'security clearance', 'secret', 'top secret',
            'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
            'public trust', 'security+', 'security plus'
        ],

        # Additional settings for better results
        'max_retries_per_proxy': 2,   # Number of retries per proxy
        'verify_timeout': 15,         # Timeout for proxy verification
        'date_format': '%Y-%m-%d',    # Standardize date format
        'strict_location': True,      # Enforce stricter location filtering

        # Location verification
        'location_center': {
            'lat': 39.5486,  # Lone Tree coordinates
            'lon': -104.8719
        },
        'max_distance': 25,  # miles

        # Debug settings
        'show_filtered_jobs': False,  # Option to show filtered out jobs
        'debug_mode': False,          # Additional debugging information
        'debug': {
            'show_traffic': True,
            'log_requests': True,
            'show_proxy_usage': True
        }
    }

    max_retries = 3
    retry_count = 0

    # Proxy verification and kill switch
    if search_config['use_proxy']:
        print("\nVerifying proxy configuration...")
        proxy_verified = False
        for proxy in search_config['proxy_list']:
            if verify_proxy(proxy):
                proxy_verified = True
                break

        if not proxy_verified:
            print("\nNo working proxies found! Exiting for safety...")
            sys.exit(1)
    else:
        print("\nWARNING: Running without proxy! This may result in IP blocking.")
        user_input = input("Continue without proxy? (yes/no): ")
        if user_input.lower() != 'yes':
            print("Exiting...")
            sys.exit(0)

    while retry_count < max_retries:
        current_proxy = search_config['proxy_list'][retry_count % len(search_config['proxy_list'])] if search_config['use_proxy'] else None

        try:
            print(f"\nAttempt {retry_count + 1} of {max_retries}")
            if current_proxy:
                print(f"Using proxy: {current_proxy}")
            print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
            print(f"Distance: {search_config['distance']} miles")
            print(f"Job Type: {search_config['job_type']}")
            print(f"Posts from last: {search_config['hours_old']} hours")
            print(f"Excluding clearance jobs: {search_config['exclude_clearance']}")
            print(f"Searching on: {', '.join(search_config['search_sites'])}")

            jobs = scrape_jobs(
                site_name=search_config['search_sites'],
                search_term=search_config['search_term'],
                location=search_config['location'],
                distance=search_config['distance'],
                results_wanted=search_config['results_wanted'],
                job_type=search_config['job_type'],
                hours_old=search_config['hours_old'],
                country_indeed="USA",
                description_format="markdown",
                verbose=2,
                proxy=current_proxy,
                verify=False if current_proxy else certifi.where(),  # Disable SSL verify when using proxy
            )

            if not isinstance(jobs, pd.DataFrame):
                print("Invalid response format from job search.")
                retry_count += 1
                continue

            if jobs.empty:
                print("No jobs found with current search parameters.")
                retry_count += 1
                continue

            print(f"\nInitial jobs found: {len(jobs)}")

            # Track filtered jobs
            filtered_jobs = {
                'clearance': 0,
                'location': 0,
                'date': 0
            }

            if search_config['exclude_clearance']:
                original_count = len(jobs)
                pattern = '|'.join(search_config['clearance_keywords'])
                clearance_mask = ~(
                    jobs['title'].str.lower().str.contains(pattern, na=False) |
                    jobs['description'].str.lower().str.contains(pattern, na=False)
                )
                filtered_jobs['clearance'] = original_count - len(jobs[clearance_mask])
                jobs = jobs[clearance_mask]

            # Fix date formatting
            jobs['date_posted'] = pd.to_datetime(jobs['date_posted'], errors='coerce')
            date_mask = jobs['date_posted'].notna()
            filtered_jobs['date'] = len(jobs) - len(jobs[date_mask])
            jobs = jobs[date_mask]

            # Location filtering
            if search_config['strict_location']:
                location_mask = jobs['location'].apply(
                    lambda x: is_within_radius(x,
                                               search_config['location_center'],
                                               search_config['max_distance'])
                )
                filtered_jobs['location'] = len(jobs) - len(jobs[location_mask])
                jobs = jobs[location_mask]

            # Print filtering summary
            print("\nFiltering Summary:")
            for reason, count in filtered_jobs.items():
                if count > 0:
                    print(f"Removed {count} jobs due to {reason}")

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            # Save to CSV
            jobs.to_csv(
                csv_filename,
                quoting=csv.QUOTE_NONNUMERIC,
                escapechar="\\",
                index=False
            )

            print(f"\nResults saved to: {csv_filename}")
            return jobs

        except Exception as e:
            print(f"\nError with proxy {current_proxy}:")
            print(f"Error details: {str(e)}")
            retry_count += 1

            if retry_count < max_retries:
                wait_time = 5 * retry_count
                print(f"\nWaiting {wait_time} seconds before trying next proxy...")
                time.sleep(wait_time)
            else:
                print("\nAll attempts failed. Please try again later.")

    return None


def calculate_distance(job_location, search_location):
    """
    Placeholder for distance calculation.
    In a full implementation, this would use geocoding and actual distance calculation.
    """
    return "Unknown"  # Would need geocoding API to calculate actual distances


def is_within_radius(job_location: str, center: dict, max_distance: int) -> bool:
    """Verify if job location is within specified radius"""
    try:
        # Add geocoding logic here if needed
        return True  # Placeholder for now
    except Exception:
        return False


if __name__ == "__main__":
    print("Starting job search...")
    jobs = search_tech_jobs_with_proxies()

    if jobs is not None and not jobs.empty:
        print("\nSearch completed successfully!")
        print(f"Total jobs found: {len(jobs)}")
        print("\nJobs by source:")
        print(jobs['site'].value_counts())
    else:
        print("\nNo results found. Try adjusting search parameters.")
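
calculate_distance and is_within_radius are placeholders in this commit. If job locations were geocoded to coordinates, the radius check could use the haversine formula; a sketch under that assumption (the geocoding step itself is still left out):

from math import radians, sin, cos, asin, sqrt

def haversine_miles(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Great-circle distance between two points in miles."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 3956 * asin(sqrt(a))  # 3956 = Earth's radius in miles

def is_within_radius(job_coords: tuple, center: dict, max_distance: int) -> bool:
    # Assumes job_coords is a (lat, lon) pair produced by a prior geocoding step.
    lat, lon = job_coords
    return haversine_miles(lat, lon, center['lat'], center['lon']) <= max_distance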

@@ -0,0 +1,88 @@
from typing import Dict, Any, Optional
from requests import Session, Response
import requests
import warnings
import urllib3

# Suppress SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def verify_proxy(proxy: str, verification_urls: list) -> bool:
    """Verify proxy is working and hiding the real IP"""
    try:
        # First check real IP
        real_ip = get_real_ip(verification_urls)
        if not real_ip:
            print("Could not verify real IP")
            return False

        proxy_ip = get_proxy_ip(proxy, verification_urls)
        if not proxy_ip:
            print("Could not verify proxy IP")
            return False

        if real_ip != proxy_ip:
            print(f"\nProxy verification successful!")
            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
            print(f"Proxy IP: {proxy_ip}")
            return True
        else:
            print("\nWarning: Proxy not working - IPs match!")
            return False

    except Exception as e:
        print(f"\nProxy verification failed: {str(e)}")
        return False


def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
    """Verify proxy usage and return traffic stats"""
    try:
        response = session.get(url, stream=True)
        content_size = len(response.content)

        return {
            "status_code": response.status_code,
            "content_size": content_size,
            "headers": dict(response.headers),
            "proxy_used": bool(session.proxies)
        }
    except Exception as e:
        print(f"Error tracking proxy usage: {str(e)}")
        return {
            "status_code": 0,
            "content_size": 0,
            "headers": {},
            "proxy_used": False
        }


def get_real_ip(verification_urls: list) -> Optional[str]:
    """Get real IP address without proxy"""
    for url in verification_urls:
        try:
            response = requests.get(url, timeout=5)
            if response.ok:
                return extract_ip(response, url)
        except:
            continue
    return None


def get_proxy_ip(proxy: str, verification_urls: list) -> Optional[str]:
    """Get IP address when using proxy"""
    proxies = {'http': proxy, 'https': proxy}
    session = requests.Session()
    session.verify = False

    for url in verification_urls:
        try:
            response = session.get(url, proxies=proxies, timeout=10)
            if response.ok:
                return extract_ip(response, url)
        except:
            continue
    return None


def extract_ip(response: Response, url: str) -> str:
    """Extract IP from response based on service used"""
    if 'ifconfig.me' in url:
        return response.text
    return response.json().get('ip', response.text)
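
A short usage sketch for this module, wiring it to the verification URLs from the sensitive-config template (the import name proxy_utils and the placeholder proxy URL are assumptions; the commit does not show the filename):

# Hypothetical usage; the module name 'proxy_utils' is assumed.
from proxy_utils import verify_proxy

verification_urls = [
    'http://api.ipify.org?format=json',
    'http://ip-api.com/json',
    'http://ifconfig.me/ip',
]

proxy = "http://username:password@host:port"  # placeholder credentials
if verify_proxy(proxy, verification_urls):
    print("Proxy is masking the real IP; safe to start scraping.")
else:
    print("Proxy check failed; aborting.")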

@@ -0,0 +1,46 @@
"""
Helper script to set up configuration files
"""
import os
import shutil
from getpass import getpass


def setup_config():
    # Check if config_sensitive.py already exists
    if os.path.exists('config_sensitive.py'):
        overwrite = input("config_sensitive.py already exists. Overwrite? (yes/no): ")
        if overwrite.lower() != 'yes':
            print("Setup cancelled.")
            return

    # Copy template
    shutil.copy2('config_sensitive_template.py', 'config_sensitive.py')

    # Get proxy configuration
    use_proxy = input("Do you want to use proxies? (yes/no): ").lower() == 'yes'

    if use_proxy:
        proxy_url = input("Enter proxy URL (format: http://host:port): ")
        username = input("Proxy username: ")
        password = getpass("Proxy password: ")

        # Create proxy string
        proxy = f"http://{username}:{password}@{proxy_url.split('//')[1]}"

        # Update config file
        with open('config_sensitive.py', 'r') as f:
            content = f.read()

        content = content.replace(
            '"http://your-username:your-password@your-proxy-host:port"',
            f'"{proxy}"'
        )

        with open('config_sensitive.py', 'w') as f:
            f.write(content)

    print("\nConfiguration file created successfully!")
    print("Remember to add config_sensitive.py to .gitignore")


if __name__ == "__main__":
    setup_config()