Add complete job search functionality with proxy support and security improvements

pull/240/head
Daniel 2025-02-04 17:05:23 -07:00
parent 333e9e6760
commit 144ede9e59
7 changed files with 717 additions and 10 deletions

47
.gitignore vendored

@@ -1,10 +1,37 @@
/venv/
/.idea
**/__pycache__/
**/.pytest_cache/
/.ipynb_checkpoints/
**/output/
**/.DS_Store
*.pyc
.env
dist
# Sensitive configuration
config_sensitive.py
# Generated files
*.csv
*.log
# Python
__pycache__/
*.py[cod]
*$py.class
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo

23
config.py Normal file

@@ -0,0 +1,23 @@
# Default configuration that can be committed
DEFAULT_CONFIG = {
    'search_term': 'IT Engineer',
    'location': 'Lone Tree, CO',
    'distance': 25,
    'results_wanted': 50,
    'job_type': 'fulltime',
    'hours_old': 72,
    'search_sites': ["indeed", "linkedin"],
    'exclude_clearance': True,
    'clearance_keywords': [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
}

try:
    # Try to import sensitive config from a local file kept out of version control
    from config_sensitive import SENSITIVE_CONFIG
except ImportError:
    SENSITIVE_CONFIG = {}
    print("Warning: No sensitive configuration found. Using defaults.")

49
config_sensitive_template.py Normal file

@@ -0,0 +1,49 @@
"""
JobSpy Sensitive Configuration Template
=====================================
Setup Instructions:
1. Copy this file to 'config_sensitive.py'
2. Fill in your actual values
3. Keep config_sensitive.py in .gitignore
Security Best Practices:
- Never commit config_sensitive.py to version control
- Store proxy credentials securely
- Rotate credentials regularly
- Use environment variables when possible
"""
SENSITIVE_CONFIG = {
'proxy_enabled': True, # Set to False to disable proxy usage
# Add your proxy URLs here (at least one required if proxy_enabled is True)
'proxy_list': [
"http://your-username:your-password@your-proxy-host:port",
"http://your-backup-proxy-url:port" # Optional backup proxy
],
# IP verification services (can be customized)
'proxy_verification_urls': [
'http://api.ipify.org?format=json',
'http://ip-api.com/json',
'http://ifconfig.me/ip'
],
# Advanced Settings
'proxy_timeout': 10, # Seconds to wait for proxy response
'max_retries': 3, # Maximum retry attempts per proxy
'rotate_interval': 100, # Rotate proxy after N requests
'verify_ssl': False # Disable for some proxy configurations
}
"""
Example format for proxy_list entries:
- Bright Data format: "http://brd-customer-[username]-zone-[zone_name]:[password]@brd.superproxy.io:22225"
- Generic format: "http://username:password@host:port"
Security Notes:
1. Never commit config_sensitive.py to version control
2. Keep your proxy credentials secure
3. Regularly rotate proxy credentials if possible
"""

109
job_search.py Normal file

@@ -0,0 +1,109 @@
from jobspy import scrape_jobs
import re
import pandas as pd
from datetime import datetime
from typing import Optional, List


def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out jobs requiring security clearance"""
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
    # Create a pattern matching any clearance keyword; escape each keyword so
    # characters like '+' and '/' are treated literally, not as regex syntax
    pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)
    # Filter out jobs where the title or description contains a clearance keyword
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )
    return df[mask]


def search_tech_jobs(
    search_sites: List[str] = ["indeed", "linkedin"],
    exclude_clearance: bool = False
) -> Optional[pd.DataFrame]:
    # Search configuration
    search_config = {
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72
    }
    try:
        print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
        print(f"Distance: {search_config['distance']} miles")
        print(f"Job Type: {search_config['job_type']}")
        print(f"Posts from last: {search_config['hours_old']} hours")
        print(f"Excluding clearance jobs: {exclude_clearance}")
        print(f"Searching on: {', '.join(search_sites)}")

        jobs = scrape_jobs(
            site_name=search_sites,
            search_term=search_config['search_term'],
            location=search_config['location'],
            distance=search_config['distance'],
            results_wanted=search_config['results_wanted'],
            job_type=search_config['job_type'],
            hours_old=search_config['hours_old'],
            country_indeed="USA",
            description_format="markdown",
            verbose=2
        )

        if isinstance(jobs, pd.DataFrame) and not jobs.empty:
            print(f"\nInitial jobs found: {len(jobs)}")

            if exclude_clearance:
                original_count = len(jobs)
                jobs = filter_clearance_jobs(jobs)
                filtered_count = len(jobs)
                print(f"Removed {original_count - filtered_count} jobs requiring clearance")

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            jobs.to_csv(csv_filename, index=False)
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        print("No jobs found with current search parameters.")
        return None
    except Exception as e:
        print("\nError during search:")
        print(f"Error details: {str(e)}")
        return None


if __name__ == "__main__":
    print("Starting job search...")
    jobs = search_tech_jobs(exclude_clearance=True)

    if jobs is not None and not jobs.empty:
        print("\nSearch completed successfully!")
        print(f"Total jobs found: {len(jobs)}")
        print("\nJobs by source:")
        print(jobs['site'].value_counts())
    else:
        print("\nNo results found. Try adjusting search parameters.")

365
job_search_advanced.py Normal file

@@ -0,0 +1,365 @@
import csv
import re
from jobspy import scrape_jobs
from datetime import datetime
import certifi
import time
from typing import Optional, Dict, Any
import pandas as pd
import requests
import sys
from requests import Session


def fix_linkedin_url(url: str) -> str:
    """Fix incomplete LinkedIn URLs."""
    if not url or 'linkedin' not in url:
        return url
    # If the URL is truncated, try to reconstruct it
    if url.startswith('https://www.linkedin') and '/jobs/view/' not in url:
        # Extract the job ID if present
        job_id = url.split('/')[-1] if url.split('/')[-1].isdigit() else None
        if job_id:
            return f"https://www.linkedin.com/jobs/view/{job_id}"
    return url


def clean_job_data(jobs_df: pd.DataFrame) -> pd.DataFrame:
    """Clean and validate job data."""
    # Fix LinkedIn URLs
    jobs_df['job_url'] = jobs_df.apply(
        lambda row: fix_linkedin_url(row['job_url']) if row['site'] == 'linkedin' else row['job_url'],
        axis=1
    )
    # Remove rows with missing essential data
    essential_columns = ['title', 'company', 'location', 'job_url']
    jobs_df = jobs_df.dropna(subset=essential_columns)
    # Clean up location data
    jobs_df['location'] = jobs_df['location'].fillna('Location not specified')
    # Ensure a description exists
    jobs_df['description'] = jobs_df['description'].fillna('No description available')
    return jobs_df


def filter_clearance_jobs(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out jobs requiring security clearance"""
    clearance_keywords = [
        'clearance', 'security clearance', 'secret', 'top secret',
        'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
        'public trust', 'security+', 'security plus'
    ]
    # Create a pattern matching any clearance keyword; escape each keyword so
    # characters like '+' and '/' are treated literally, not as regex syntax
    pattern = '|'.join(re.escape(keyword) for keyword in clearance_keywords)
    # Filter out jobs where the title or description contains a clearance keyword
    mask = ~(
        df['title'].str.lower().str.contains(pattern, na=False) |
        df['description'].str.lower().str.contains(pattern, na=False)
    )
    return df[mask]


def verify_proxy(proxy: str) -> bool:
    """Enhanced proxy verification"""
    try:
        # Check multiple IP verification services
        verification_urls = [
            'http://api.ipify.org?format=json',
            'http://ip-api.com/json',
            'http://ifconfig.me/ip'
        ]

        # First check the real IP (only the first 3 digits are printed, for security)
        real_ips = []
        for url in verification_urls:
            try:
                response = requests.get(url, timeout=5)
                if response.ok:
                    ip = response.text if 'ifconfig' in url else response.json().get('ip', response.text)
                    real_ips.append(ip)
                    break
            except Exception:
                continue

        if not real_ips:
            print("Could not verify real IP")
            return False
        real_ip = real_ips[0]

        # Check again through the proxy
        proxies = {
            'http': proxy,
            'https': proxy
        }
        # Configure the session to tolerate SSL issues introduced by the proxy
        session = requests.Session()
        session.verify = False
        requests.packages.urllib3.disable_warnings()

        proxy_ips = []
        for url in verification_urls:
            try:
                response = session.get(url, proxies=proxies, timeout=10)
                if response.ok:
                    ip = response.text if 'ifconfig' in url else response.json().get('ip', response.text)
                    proxy_ips.append(ip)
                    break
            except Exception:
                continue

        if not proxy_ips:
            print("Could not verify proxy IP")
            return False
        proxy_ip = proxy_ips[0]

        if real_ip != proxy_ip:
            print("\nProxy verification successful!")
            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
            print(f"Proxy IP: {proxy_ip}")
            print(f"IP Verification Service: {url}")
            return True
        else:
            print("\nWarning: Proxy not working - IPs match!")
            return False
    except Exception as e:
        print(f"\nProxy verification failed: {str(e)}")
        return False


def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
    """Verify proxy usage and return traffic stats"""
    response = session.get(url, stream=True)
    content_size = len(response.content)
    return {
        "status_code": response.status_code,
        "content_size": content_size,
        "headers": dict(response.headers),
        "proxy_used": bool(session.proxies)
    }


def search_tech_jobs_with_proxies() -> Optional[pd.DataFrame]:
    # Comprehensive search configuration
    search_config = {
        # Search parameters
        'search_term': 'IT Engineer',
        'location': 'Lone Tree, CO',
        'distance': 25,
        'results_wanted': 50,
        'job_type': 'fulltime',
        'hours_old': 72,
        # Filter settings
        'exclude_clearance': True,
        'search_sites': ["indeed", "linkedin"],
        # Proxy settings -- keep placeholders here; real credentials belong in
        # config_sensitive.py, never in committed code
        'use_proxy': True,  # Proxy kill switch
        'proxy_list': [
            "http://your-username:your-password@your-proxy-host:port",
            "http://your-backup-proxy-url:port"  # Optional backup proxy
        ],
        # Clearance keywords to filter
        'clearance_keywords': [
            'clearance', 'security clearance', 'secret', 'top secret',
            'ts/sci', 'sci', 'classified', 'poly', 'polygraph',
            'public trust', 'security+', 'security plus'
        ],
        # Additional settings for better results
        'max_retries_per_proxy': 2,  # Number of retries per proxy
        'verify_timeout': 15,        # Timeout for proxy verification
        'date_format': '%Y-%m-%d',   # Standardize date format
        'strict_location': True,     # Enforce stricter location filtering
        # Location verification
        'location_center': {
            'lat': 39.5486,   # Lone Tree coordinates
            'lon': -104.8719
        },
        'max_distance': 25,  # miles
        # Debug settings
        'show_filtered_jobs': False,  # Option to show filtered out jobs
        'debug_mode': False,          # Additional debugging information
        'debug': {
            'show_traffic': True,
            'log_requests': True,
            'show_proxy_usage': True
        }
    }

    max_retries = 3
    retry_count = 0

    # Proxy verification and kill switch
    if search_config['use_proxy']:
        print("\nVerifying proxy configuration...")
        proxy_verified = False
        for proxy in search_config['proxy_list']:
            if verify_proxy(proxy):
                proxy_verified = True
                break
        if not proxy_verified:
            print("\nNo working proxies found! Exiting for safety...")
            sys.exit(1)
    else:
        print("\nWARNING: Running without proxy! This may result in IP blocking.")
        user_input = input("Continue without proxy? (yes/no): ")
        if user_input.lower() != 'yes':
            print("Exiting...")
            sys.exit(0)

    while retry_count < max_retries:
        current_proxy = (
            search_config['proxy_list'][retry_count % len(search_config['proxy_list'])]
            if search_config['use_proxy'] else None
        )
        try:
            print(f"\nAttempt {retry_count + 1} of {max_retries}")
            if current_proxy:
                print(f"Using proxy: {current_proxy}")
            print(f"Searching for: {search_config['search_term']} in {search_config['location']}")
            print(f"Distance: {search_config['distance']} miles")
            print(f"Job Type: {search_config['job_type']}")
            print(f"Posts from last: {search_config['hours_old']} hours")
            print(f"Excluding clearance jobs: {search_config['exclude_clearance']}")
            print(f"Searching on: {', '.join(search_config['search_sites'])}")

            jobs = scrape_jobs(
                site_name=search_config['search_sites'],
                search_term=search_config['search_term'],
                location=search_config['location'],
                distance=search_config['distance'],
                results_wanted=search_config['results_wanted'],
                job_type=search_config['job_type'],
                hours_old=search_config['hours_old'],
                country_indeed="USA",
                description_format="markdown",
                verbose=2,
                proxy=current_proxy,
                verify=False if current_proxy else certifi.where(),  # Disable SSL verification when routing through a proxy
            )

            if not isinstance(jobs, pd.DataFrame):
                print("Invalid response format from job search.")
                retry_count += 1
                continue
            if jobs.empty:
                print("No jobs found with current search parameters.")
                retry_count += 1
                continue

            print(f"\nInitial jobs found: {len(jobs)}")

            # Track filtered jobs
            filtered_jobs = {
                'clearance': 0,
                'location': 0,
                'date': 0
            }

            if search_config['exclude_clearance']:
                original_count = len(jobs)
                # Escape keywords so '+' and '/' are matched literally
                pattern = '|'.join(re.escape(k) for k in search_config['clearance_keywords'])
                clearance_mask = ~(
                    jobs['title'].str.lower().str.contains(pattern, na=False) |
                    jobs['description'].str.lower().str.contains(pattern, na=False)
                )
                filtered_jobs['clearance'] = original_count - len(jobs[clearance_mask])
                jobs = jobs[clearance_mask]

            # Fix date formatting
            jobs['date_posted'] = pd.to_datetime(jobs['date_posted'], errors='coerce')
            date_mask = jobs['date_posted'].notna()
            filtered_jobs['date'] = len(jobs) - len(jobs[date_mask])
            jobs = jobs[date_mask]

            # Location filtering
            if search_config['strict_location']:
                location_mask = jobs['location'].apply(
                    lambda x: is_within_radius(x,
                                               search_config['location_center'],
                                               search_config['max_distance'])
                )
                filtered_jobs['location'] = len(jobs) - len(jobs[location_mask])
                jobs = jobs[location_mask]

            # Print filtering summary
            print("\nFiltering Summary:")
            for reason, count in filtered_jobs.items():
                if count > 0:
                    print(f"Removed {count} jobs due to {reason}")

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"it_jobs_{timestamp}.csv"

            # Print job summary
            print("\nJob Listings Found:")
            print("-------------------")
            for idx, job in jobs.iterrows():
                print(f"\n{idx + 1}. {job.get('title', 'No title')}")
                print(f"   Company: {job.get('company', 'No company')}")
                print(f"   Location: {job.get('location', 'No location')}")
                print(f"   Source: {job.get('site', 'No source')}")
                print(f"   Date Posted: {job.get('date_posted', 'No date')}")

            # Save to CSV
            jobs.to_csv(
                csv_filename,
                quoting=csv.QUOTE_NONNUMERIC,
                escapechar="\\",
                index=False
            )
            print(f"\nResults saved to: {csv_filename}")
            return jobs

        except Exception as e:
            print(f"\nError with proxy {current_proxy}:")
            print(f"Error details: {str(e)}")
            retry_count += 1
            if retry_count < max_retries:
                wait_time = 5 * retry_count
                print(f"\nWaiting {wait_time} seconds before trying next proxy...")
                time.sleep(wait_time)
            else:
                print("\nAll attempts failed. Please try again later.")

    return None

def calculate_distance(job_location, search_location):
    """
    Placeholder for distance calculation.
    In a full implementation, this would use geocoding and actual distance calculation.
    """
    return "Unknown"  # Would need geocoding API to calculate actual distances


def is_within_radius(job_location: str, center: dict, max_distance: int) -> bool:
    """Verify if job location is within specified radius"""
    try:
        # Add geocoding logic here if needed
        return True  # Placeholder for now
    except Exception:
        return False


if __name__ == "__main__":
    print("Starting job search...")
    jobs = search_tech_jobs_with_proxies()

    if jobs is not None and not jobs.empty:
        print("\nSearch completed successfully!")
        print(f"Total jobs found: {len(jobs)}")
        print("\nJobs by source:")
        print(jobs['site'].value_counts())
    else:
        print("\nNo results found. Try adjusting search parameters.")

88
proxy_utils.py Normal file

@@ -0,0 +1,88 @@
from typing import Dict, Any, Optional
from requests import Session, Response
import requests
import urllib3

# Suppress SSL warnings triggered by proxy connections with verification disabled
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def verify_proxy(proxy: str, verification_urls: list) -> bool:
    """Verify the proxy is working and hiding the real IP"""
    try:
        # First check the real IP
        real_ip = get_real_ip(verification_urls)
        if not real_ip:
            print("Could not verify real IP")
            return False

        proxy_ip = get_proxy_ip(proxy, verification_urls)
        if not proxy_ip:
            print("Could not verify proxy IP")
            return False

        if real_ip != proxy_ip:
            print("\nProxy verification successful!")
            print(f"Real IP: {real_ip[:3]}... (hidden for security)")
            print(f"Proxy IP: {proxy_ip}")
            return True
        else:
            print("\nWarning: Proxy not working - IPs match!")
            return False
    except Exception as e:
        print(f"\nProxy verification failed: {str(e)}")
        return False


def verify_proxy_usage(session: Session, url: str) -> Dict[str, Any]:
    """Verify proxy usage and return traffic stats"""
    try:
        response = session.get(url, stream=True)
        content_size = len(response.content)
        return {
            "status_code": response.status_code,
            "content_size": content_size,
            "headers": dict(response.headers),
            "proxy_used": bool(session.proxies)
        }
    except Exception as e:
        print(f"Error tracking proxy usage: {str(e)}")
        return {
            "status_code": 0,
            "content_size": 0,
            "headers": {},
            "proxy_used": False
        }


def get_real_ip(verification_urls: list) -> Optional[str]:
    """Get the real IP address without a proxy"""
    for url in verification_urls:
        try:
            response = requests.get(url, timeout=5)
            if response.ok:
                return extract_ip(response, url)
        except Exception:
            continue
    return None


def get_proxy_ip(proxy: str, verification_urls: list) -> Optional[str]:
    """Get the IP address seen when using the proxy"""
    proxies = {'http': proxy, 'https': proxy}
    session = requests.Session()
    session.verify = False
    for url in verification_urls:
        try:
            response = session.get(url, proxies=proxies, timeout=10)
            if response.ok:
                return extract_ip(response, url)
        except Exception:
            continue
    return None


def extract_ip(response: Response, url: str) -> str:
    """Extract the IP from the response based on the service used"""
    if 'ifconfig.me' in url:
        return response.text
    return response.json().get('ip', response.text)
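A short usage sketch for these helpers, reusing the verification URLs from the sensitive-config template (the proxy URL below is a placeholder, not a working endpoint):

# Illustrative usage of proxy_utils; the proxy URL is a placeholder.
import requests
from proxy_utils import verify_proxy, verify_proxy_usage

verification_urls = [
    'http://api.ipify.org?format=json',
    'http://ip-api.com/json',
    'http://ifconfig.me/ip',
]
proxy = "http://your-username:your-password@your-proxy-host:port"

if verify_proxy(proxy, verification_urls):
    session = requests.Session()
    session.proxies = {'http': proxy, 'https': proxy}
    session.verify = False
    stats = verify_proxy_usage(session, 'http://api.ipify.org?format=json')
    print(stats['status_code'], stats['content_size'], stats['proxy_used'])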

46
setup_config.py Normal file

@@ -0,0 +1,46 @@
"""
Helper script to set up configuration files
"""
import os
import shutil
from getpass import getpass
def setup_config():
# Check if config_sensitive.py already exists
if os.path.exists('config_sensitive.py'):
overwrite = input("config_sensitive.py already exists. Overwrite? (yes/no): ")
if overwrite.lower() != 'yes':
print("Setup cancelled.")
return
# Copy template
shutil.copy2('config_sensitive_template.py', 'config_sensitive.py')
# Get proxy configuration
use_proxy = input("Do you want to use proxies? (yes/no): ").lower() == 'yes'
if use_proxy:
proxy_url = input("Enter proxy URL (format: http://host:port): ")
username = input("Proxy username: ")
password = getpass("Proxy password: ")
# Create proxy string
proxy = f"http://{username}:{password}@{proxy_url.split('//')[1]}"
# Update config file
with open('config_sensitive.py', 'r') as f:
content = f.read()
content = content.replace(
'"http://your-username:your-password@your-proxy-host:port"',
f'"{proxy}"'
)
with open('config_sensitive.py', 'w') as f:
f.write(content)
print("\nConfiguration file created successfully!")
print("Remember to add config_sensitive.py to .gitignore")
if __name__ == "__main__":
setup_config()