mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-04 19:44:30 -08:00
Compare commits
7 Commits
d4d52d05f5
...
1.1.82
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
abd5878238 | ||
|
|
ae2b1ea42c | ||
|
|
53b3b41385 | ||
|
|
9aae02453d | ||
|
|
94d413bad1 | ||
|
|
61205bcc77 | ||
|
|
f1602eca70 |
10
README.md
10
README.md
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
|
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter** & other job boards concurrently
|
||||||
- Aggregates the job postings in a dataframe
|
- Aggregates the job postings in a dataframe
|
||||||
- Proxies support to bypass blocking
|
- Proxies support to bypass blocking
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ import csv
|
|||||||
from jobspy import scrape_jobs
|
from jobspy import scrape_jobs
|
||||||
|
|
||||||
jobs = scrape_jobs(
|
jobs = scrape_jobs(
|
||||||
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
|
site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
|
||||||
search_term="software engineer",
|
search_term="software engineer",
|
||||||
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
||||||
location="San Francisco, CA",
|
location="San Francisco, CA",
|
||||||
@@ -59,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
|
|||||||
```plaintext
|
```plaintext
|
||||||
Optional
|
Optional
|
||||||
├── site_name (list|str):
|
├── site_name (list|str):
|
||||||
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt
|
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt, naukri, bdjobs
|
||||||
| (default is all)
|
| (default is all)
|
||||||
│
|
│
|
||||||
├── search_term (str)
|
├── search_term (str)
|
||||||
@@ -86,6 +86,10 @@ Optional
|
|||||||
│
|
│
|
||||||
├── easy_apply (bool):
|
├── easy_apply (bool):
|
||||||
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
||||||
|
|
|
||||||
|
├── user_agent (str):
|
||||||
|
| override the default user agent which may be outdated
|
||||||
|
|
|
||||||
│
|
│
|
||||||
├── description_format (str):
|
├── description_format (str):
|
||||||
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from typing import Tuple
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from jobspy.bayt import BaytScraper
|
from jobspy.bayt import BaytScraper
|
||||||
|
from jobspy.bdjobs import BDJobs
|
||||||
from jobspy.glassdoor import Glassdoor
|
from jobspy.glassdoor import Glassdoor
|
||||||
from jobspy.google import Google
|
from jobspy.google import Google
|
||||||
from jobspy.indeed import Indeed
|
from jobspy.indeed import Indeed
|
||||||
@@ -25,6 +26,8 @@ from jobspy.util import (
|
|||||||
from jobspy.ziprecruiter import ZipRecruiter
|
from jobspy.ziprecruiter import ZipRecruiter
|
||||||
|
|
||||||
|
|
||||||
|
# Update the SCRAPER_MAPPING dictionary in the scrape_jobs function
|
||||||
|
|
||||||
def scrape_jobs(
|
def scrape_jobs(
|
||||||
site_name: str | list[str] | Site | list[Site] | None = None,
|
site_name: str | list[str] | Site | list[Site] | None = None,
|
||||||
search_term: str | None = None,
|
search_term: str | None = None,
|
||||||
@@ -45,6 +48,7 @@ def scrape_jobs(
|
|||||||
hours_old: int = None,
|
hours_old: int = None,
|
||||||
enforce_annual_salary: bool = False,
|
enforce_annual_salary: bool = False,
|
||||||
verbose: int = 0,
|
verbose: int = 0,
|
||||||
|
user_agent: str = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
@@ -59,6 +63,7 @@ def scrape_jobs(
|
|||||||
Site.GOOGLE: Google,
|
Site.GOOGLE: Google,
|
||||||
Site.BAYT: BaytScraper,
|
Site.BAYT: BaytScraper,
|
||||||
Site.NAUKRI: Naukri,
|
Site.NAUKRI: Naukri,
|
||||||
|
Site.BDJOBS: BDJobs, # Add BDJobs to the scraper mapping
|
||||||
}
|
}
|
||||||
set_logger_level(verbose)
|
set_logger_level(verbose)
|
||||||
job_type = get_enum_from_value(job_type) if job_type else None
|
job_type = get_enum_from_value(job_type) if job_type else None
|
||||||
@@ -98,7 +103,7 @@ def scrape_jobs(
|
|||||||
|
|
||||||
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
||||||
scraper_class = SCRAPER_MAPPING[site]
|
scraper_class = SCRAPER_MAPPING[site]
|
||||||
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
|
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
|
||||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||||
cap_name = site.value.capitalize()
|
cap_name = site.value.capitalize()
|
||||||
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
||||||
@@ -212,4 +217,10 @@ def scrape_jobs(
|
|||||||
by=["site", "date_posted"], ascending=[True, False]
|
by=["site", "date_posted"], ascending=[True, False]
|
||||||
).reset_index(drop=True)
|
).reset_index(drop=True)
|
||||||
else:
|
else:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
|
||||||
|
# Names exported by `from jobspy import *`. scrape_jobs is listed alongside
# BDJobs so that a star import still exposes the package's main entry point.
__all__ = [
    "scrape_jobs",
    "BDJobs",
]
|
||||||
@@ -25,7 +25,7 @@ class BaytScraper(Scraper):
|
|||||||
band_delay = 3
|
band_delay = 3
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
|
super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
|
||||||
self.scraper_input = None
|
self.scraper_input = None
|
||||||
|
|||||||
353
jobspy/bdjobs/__init__.py
Normal file
353
jobspy/bdjobs/__init__.py
Normal file
@@ -0,0 +1,353 @@
|
|||||||
|
# __init__.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, List, Dict, Any
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
from jobspy.exception import BDJobsException
|
||||||
|
from jobspy.bdjobs.constant import headers, search_params
|
||||||
|
from jobspy.bdjobs.util import (
|
||||||
|
parse_location,
|
||||||
|
parse_date,
|
||||||
|
find_job_listings,
|
||||||
|
is_job_remote,
|
||||||
|
)
|
||||||
|
from jobspy.model import (
|
||||||
|
JobPost,
|
||||||
|
Location,
|
||||||
|
JobResponse,
|
||||||
|
Country,
|
||||||
|
Scraper,
|
||||||
|
ScraperInput,
|
||||||
|
Site,
|
||||||
|
DescriptionFormat,
|
||||||
|
)
|
||||||
|
from jobspy.util import (
|
||||||
|
extract_emails_from_text,
|
||||||
|
create_session,
|
||||||
|
create_logger,
|
||||||
|
remove_attributes,
|
||||||
|
markdown_converter,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = create_logger("BDJobs")
|
||||||
|
|
||||||
|
|
||||||
|
class BDJobs(Scraper):
    """Scraper for job listings on bdjobs.com.

    Walks the paginated search results, turns every job card into a
    ``JobPost`` and enriches it with the description and job type read from
    the job's detail page.
    """

    base_url = "https://jobs.bdjobs.com"
    search_url = "https://jobs.bdjobs.com/jobsearch.asp"
    # Pause between result pages: random value in [delay, delay + band_delay] seconds
    delay = 2
    band_delay = 3

    def __init__(
        self,
        proxies: list[str] | str | None = None,
        ca_cert: str | None = None,
        user_agent: str | None = None,
    ):
        """
        Initializes BDJobsScraper with the BDJobs job search url

        :param proxies: proxy or proxies handed to the HTTP session
        :param ca_cert: CA certificate handed to the HTTP session
        :param user_agent: optional User-Agent that replaces the default header
        """
        # scrape_jobs passes user_agent to every scraper class, so it has to be
        # accepted here and forwarded to the Scraper base class.
        super().__init__(
            Site.BDJOBS, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent
        )
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        if self.user_agent:
            self.session.headers["User-Agent"] = self.user_agent
        self.scraper_input = None
        self.country = "bangladesh"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes BDJobs for jobs with scraper_input criteria

        Requests result pages until ``results_wanted`` unique jobs have been
        collected, a page has no job cards, a page adds no new job, a non-200
        response is received or an error occurs.

        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        page = 1

        # Set up search parameters
        params = search_params.copy()
        params["txtsearch"] = scraper_input.search_term

        def continue_search() -> bool:
            return len(job_list) < scraper_input.results_wanted

        while continue_search():
            log.info(f"search page: {page}")

            try:
                # The first page is requested without an explicit page number
                if page > 1:
                    params["pg"] = page

                response = self.session.get(
                    self.search_url,
                    params=params,
                    timeout=getattr(scraper_input, "request_timeout", 60),
                )

                if response.status_code != 200:
                    log.error(f"BDJobs response status code {response.status_code}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = find_job_listings(soup)

                if not job_cards:
                    log.info("No more job listings found")
                    break

                log.info(f"Found {len(job_cards)} job cards on page {page}")

                jobs_before_page = len(job_list)
                for job_card in job_cards:
                    try:
                        job_post = self._process_job(job_card)
                        if job_post and job_post.id not in seen_ids:
                            seen_ids.add(job_post.id)
                            job_list.append(job_post)

                        if not continue_search():
                            break
                    except Exception as e:
                        log.error(f"Error processing job card: {str(e)}")

                # A page that adds nothing (only duplicates or unparsable cards)
                # would otherwise keep this loop requesting pages forever.
                if len(job_list) == jobs_before_page:
                    log.info("No new jobs found on page, stopping search")
                    break

                # Enough results collected: no need to wait for another page
                if not continue_search():
                    break

                page += 1
                # Add delay between requests
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

            except Exception as e:
                log.error(f"Error during scraping: {str(e)}")
                break

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    @staticmethod
    def _class_contains(*terms: str):
        """Builds a bs4 ``class_`` filter matching class values that contain any of ``terms`` (case-insensitive)."""

        def matches(css_class) -> bool:
            return bool(css_class) and any(
                term in css_class.lower() for term in terms
            )

        return matches

    @staticmethod
    def _find_labelled_value(soup, *labels: str) -> Optional[str]:
        """Returns the text of the span/div following a span/div whose string contains one of ``labels``, or None."""
        label_elem = soup.find(
            ["span", "div"],
            string=lambda s: s and any(label in s.lower() for label in labels),
        )
        if not label_elem:
            return None
        value_elem = label_elem.find_next(["span", "div"])
        # find_next returns None when the label is the last span/div of the page
        if not value_elem:
            return None
        return value_elem.get_text(strip=True) or None

    def _process_job(self, job_card: Tag) -> Optional[JobPost]:
        """
        Processes a job card element into a JobPost object

        :param job_card: Job card element
        :return: JobPost object, or None when the card has no job detail link
            or processing fails
        """
        # Local import: only needed for the fallback id below
        import hashlib

        try:
            # Extract job ID and URL
            job_link = job_card.find("a", href=lambda h: h and "jobdetail" in h.lower())
            if not job_link:
                return None

            job_url = job_link.get("href")
            if not job_url.startswith("http"):
                job_url = urljoin(self.base_url, job_url)

            # Extract job ID from URL
            if "jobid=" in job_url:
                job_id = job_url.split("jobid=")[-1].split("&")[0]
            else:
                # The built-in hash() of a str is salted per process, so a
                # digest is used to keep the id identical between runs.
                url_digest = hashlib.sha1(job_url.encode("utf-8")).hexdigest()[:16]
                job_id = f"bdjobs-{url_digest}"

            # Extract title
            title = job_link.get_text(strip=True)
            if not title:
                title_elem = job_card.find(
                    ["h2", "h3", "h4", "strong", "div"],
                    class_=lambda c: c and "job-title-text" in c,
                )
                title = title_elem.get_text(strip=True) if title_elem else "N/A"

            # Extract company name, falling back to looser class names
            company_elem = job_card.find(
                ["span", "div"], class_=self._class_contains("comp-name-text")
            ) or job_card.find(
                ["span", "div"],
                class_=self._class_contains("company", "org", "comp-name"),
            )
            company_name = company_elem.get_text(strip=True) if company_elem else "N/A"

            # Extract location, falling back to looser class names
            location_elem = job_card.find(
                ["span", "div"], class_=self._class_contains("locon-text-d")
            ) or job_card.find(
                ["span", "div"],
                class_=self._class_contains("location", "area", "locon"),
            )
            location_text = (
                location_elem.get_text(strip=True)
                if location_elem
                else "Dhaka, Bangladesh"
            )

            # Create Location object
            location = parse_location(location_text, self.country)

            # Extract date posted
            # NOTE(review): the filter also matches "deadline" classes, so this
            # value may be the application deadline rather than the posting
            # date - confirm against the live markup.
            date_elem = job_card.find(
                ["span", "div"],
                class_=self._class_contains("date", "deadline", "published"),
            )
            date_posted = None
            if date_elem:
                date_posted = parse_date(date_elem.get_text(strip=True))

            # Check if job is remote
            is_remote = is_job_remote(title, location=location)

            # Create job post object
            job_post = JobPost(
                id=job_id,
                title=title,
                company_name=company_name,
                location=location,
                date_posted=date_posted,
                job_url=job_url,
                is_remote=is_remote,
                site=self.site,
            )

            # Always fetch description for BDJobs
            job_details = self._get_job_details(job_url)
            job_post.description = job_details.get("description", "")
            job_post.job_type = job_details.get("job_type", "")

            return job_post
        except Exception as e:
            log.error(f"Error in _process_job: {str(e)}")
            return None

    def _get_job_details(self, job_url: str) -> Dict[str, Any]:
        """
        Gets detailed job information from the job page

        :param job_url: Job page URL
        :return: Dictionary with the keys "description", "job_type" and
            "company_industry"; empty when the page does not answer with
            status 200 or an error occurs
        """
        try:
            # Same timeout as the search requests instead of a fixed value
            response = self.session.get(
                job_url, timeout=getattr(self.scraper_input, "request_timeout", 60)
            )
            if response.status_code != 200:
                return {}

            soup = BeautifulSoup(response.text, "html.parser")

            description = ""

            # Preferred source: the text below the responsibilities heading
            # inside the "jobcontent" div
            job_content_div = soup.find("div", class_="jobcontent")
            if job_content_div:
                responsibilities_heading = job_content_div.find(
                    "h4", id="job_resp"
                ) or job_content_div.find(
                    ["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
                )
                if responsibilities_heading:
                    responsibilities_elements = []
                    # Collect following siblings until the next heading or hr
                    for sibling in responsibilities_heading.find_next_siblings():
                        if sibling.name in ["hr", "h4", "h5"]:
                            break
                        if sibling.name == "ul":
                            responsibilities_elements.extend(
                                li.get_text(separator=" ", strip=True)
                                for li in sibling.find_all("li")
                            )
                        elif sibling.name == "p":
                            responsibilities_elements.append(
                                sibling.get_text(separator=" ", strip=True)
                            )

                    description = "\n".join(responsibilities_elements)

            # Fallback: the first div/section whose class looks like a description
            if not description:
                description_elem = soup.find(
                    ["div", "section"],
                    class_=self._class_contains(
                        "job-description", "details", "requirements"
                    ),
                )
                if description_elem:
                    description_elem = remove_attributes(description_elem)
                    description = description_elem.prettify(formatter="html")
                    if (
                        hasattr(self.scraper_input, "description_format")
                        and self.scraper_input.description_format
                        == DescriptionFormat.MARKDOWN
                    ):
                        description = markdown_converter(description)

            # A label without a following value element yields None instead of
            # raising and throwing away the description parsed above.
            job_type = self._find_labelled_value(soup, "job type", "employment type")
            company_industry = self._find_labelled_value(soup, "industry")

            return {
                "description": description,
                "job_type": job_type,
                "company_industry": company_industry,
            }

        except Exception as e:
            log.error(f"Error getting job details: {str(e)}")
            return {}
|
||||||
32
jobspy/bdjobs/constant.py
Normal file
32
jobspy/bdjobs/constant.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
#constant.py
|
||||||
|
# Default HTTP headers sent with every BDJobs request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Referer": "https://jobs.bdjobs.com/",
    "Cache-Control": "max-age=0",
}

# Base query parameters of a BDJobs search request
search_params = {"hidJobSearch": "jobsearch"}

# "tag.class" selectors for job cards, tried in this order
job_selectors = [
    "div.job-item",  # Catches both normal and premium job cards, as well as other types
    "div.sout-jobs-wrapper",  # Catches job listings in the main search results page
    "div.norm-jobs-wrapper",  # Catches normal job listings
    "div.featured-wrap",  # Catches featured job listings
]

# strptime formats tried, in this order, when parsing BDJobs dates
date_formats = [
    "%d %b %Y",
    "%d-%b-%Y",
    "%d %B %Y",
    "%B %d, %Y",
    "%d/%m/%Y",
]
|
||||||
100
jobspy/bdjobs/util.py
Normal file
100
jobspy/bdjobs/util.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#util.py
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, List, Dict, Any
|
||||||
|
|
||||||
|
from jobspy.model import Location, Country
|
||||||
|
|
||||||
|
|
||||||
|
def parse_location(location_text: str, country: str = "bangladesh") -> Location:
    """
    Builds a Location object from the location text of a job listing

    Text containing a comma is read as "city, state" (anything after a second
    comma is ignored); text without a comma is used as the city.

    :param location_text: Location text from job listing
    :param country: Country name resolved with Country.from_string
    :return: Location object
    """
    pieces = location_text.split(",")
    resolved_country = Country.from_string(country)

    if len(pieces) < 2:
        return Location(city=location_text.strip(), country=resolved_country)

    return Location(
        city=pieces[0].strip(),
        state=pieces[1].strip(),
        country=resolved_country,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date(date_text: str) -> Optional[datetime]:
    """
    Parses date text into a datetime object

    A "Deadline:" label is removed and surrounding whitespace is stripped
    before each format in ``date_formats`` is tried in order.

    :param date_text: Date text from job listing
    :return: datetime object, or None if the text is not a string or matches
        none of the known formats
    """
    from .constant import date_formats

    # Explicit guard instead of a blanket ``except Exception``: only
    # non-string input made the previous implementation hit that handler.
    if not isinstance(date_text, str):
        return None

    cleaned_text = date_text.replace("Deadline:", "").strip()

    for fmt in date_formats:
        try:
            return datetime.strptime(cleaned_text, fmt)
        except ValueError:
            continue

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_job_listings(soup: BeautifulSoup) -> List[Any]:
    """
    Locates the job card elements of a search result page

    The "tag.class" selectors from ``job_selectors`` are tried in order and the
    matches of the first selector that finds anything are returned. If none
    matches, the parent elements of all links whose href contains "jobdetail"
    are returned.

    :param soup: BeautifulSoup object
    :return: List of job card elements (empty if nothing was found)
    """
    from .constant import job_selectors

    for selector in job_selectors:
        tag_name, dot, class_name = selector.partition(".")
        if not dot:
            # Selectors without a class part are not supported
            continue
        matches = soup.find_all(tag_name, class_=class_name)
        if matches:
            return matches

    # Fallback: wrap every job detail link in its parent element
    detail_links = soup.find_all("a", href=lambda h: h and "jobdetail" in h.lower())
    return [anchor.parent for anchor in detail_links]
|
||||||
|
|
||||||
|
|
||||||
|
def is_job_remote(
    title: str,
    description: Optional[str] = None,
    location: Optional["Location"] = None,
) -> bool:
    """
    Determines if a job is remote based on title, description, and location

    The check is a case-insensitive substring search for a fixed set of
    keywords in the combined text; missing (None or empty) parts are skipped.

    :param title: Job title
    :param description: Job description
    :param location: Job location; its display_location() text is searched
    :return: True if job is remote, False otherwise
    """
    remote_keywords = ("remote", "work from home", "wfh", "home based")

    # A missing title is treated as empty text instead of raising
    text_parts = [title or ""]
    if description:
        text_parts.append(description)
    if location:
        text_parts.append(location.display_location() or "")

    full_text = " ".join(text_parts).lower()
    return any(keyword in full_text for keyword in remote_keywords)
|
||||||
@@ -37,4 +37,9 @@ class BaytException(Exception):
|
|||||||
|
|
||||||
class NaukriException(Exception):
    """Error raised for Naukri failures; an empty or missing message falls back to a default text."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with Naukri"
        super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
class BDJobsException(Exception):
    """Error raised for BDJobs failures; an empty or missing message falls back to a default text."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with BDJobs"
        super().__init__(message)
|
||||||
@@ -34,13 +34,13 @@ log = create_logger("Glassdoor")
|
|||||||
|
|
||||||
class Glassdoor(Scraper):
|
class Glassdoor(Scraper):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initializes GlassdoorScraper with the Glassdoor job search url
|
Initializes GlassdoorScraper with the Glassdoor job search url
|
||||||
"""
|
"""
|
||||||
site = Site(Site.GLASSDOOR)
|
site = Site(Site.GLASSDOOR)
|
||||||
super().__init__(site, proxies=proxies, ca_cert=ca_cert)
|
super().__init__(site, proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
|
||||||
|
|
||||||
self.base_url = None
|
self.base_url = None
|
||||||
self.country = None
|
self.country = None
|
||||||
@@ -65,6 +65,8 @@ class Glassdoor(Scraper):
|
|||||||
)
|
)
|
||||||
token = self._get_csrf_token()
|
token = self._get_csrf_token()
|
||||||
headers["gd-csrf-token"] = token if token else fallback_token
|
headers["gd-csrf-token"] = token if token else fallback_token
|
||||||
|
if self.user_agent:
|
||||||
|
headers["user-agent"] = self.user_agent
|
||||||
self.session.headers.update(headers)
|
self.session.headers.update(headers)
|
||||||
|
|
||||||
location_id, location_type = self._get_location(
|
location_id, location_type = self._get_location(
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ headers = {
|
|||||||
"sec-fetch-dest": "empty",
|
"sec-fetch-dest": "empty",
|
||||||
"sec-fetch-mode": "cors",
|
"sec-fetch-mode": "cors",
|
||||||
"sec-fetch-site": "same-origin",
|
"sec-fetch-site": "same-origin",
|
||||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
|
||||||
}
|
}
|
||||||
query_template = """
|
query_template = """
|
||||||
query JobSearchResultsQuery(
|
query JobSearchResultsQuery(
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ from jobspy.google.util import log, find_job_info_initial_page, find_job_info
|
|||||||
|
|
||||||
class Google(Scraper):
|
class Google(Scraper):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initializes Google Scraper with the Goodle jobs search url
|
Initializes Google Scraper with the Goodle jobs search url
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ log = create_logger("Indeed")
|
|||||||
|
|
||||||
class Indeed(Scraper):
|
class Indeed(Scraper):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initializes IndeedScraper with the Indeed API url
|
Initializes IndeedScraper with the Indeed API url
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class LinkedIn(Scraper):
|
|||||||
jobs_per_page = 25
|
jobs_per_page = 25
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initializes LinkedInScraper with the LinkedIn job search url
|
Initializes LinkedInScraper with the LinkedIn job search url
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ class Country(Enum):
|
|||||||
AUSTRALIA = ("australia", "au", "com.au")
|
AUSTRALIA = ("australia", "au", "com.au")
|
||||||
AUSTRIA = ("austria", "at", "at")
|
AUSTRIA = ("austria", "at", "at")
|
||||||
BAHRAIN = ("bahrain", "bh")
|
BAHRAIN = ("bahrain", "bh")
|
||||||
|
BANGLADESH = ("bangladesh", "bd") # Added Bangladesh
|
||||||
BELGIUM = ("belgium", "be", "fr:be")
|
BELGIUM = ("belgium", "be", "fr:be")
|
||||||
BULGARIA = ("bulgaria", "bg")
|
BULGARIA = ("bulgaria", "bg")
|
||||||
BRAZIL = ("brazil", "br", "com.br")
|
BRAZIL = ("brazil", "br", "com.br")
|
||||||
@@ -291,6 +292,7 @@ class Site(Enum):
|
|||||||
GOOGLE = "google"
|
GOOGLE = "google"
|
||||||
BAYT = "bayt"
|
BAYT = "bayt"
|
||||||
NAUKRI = "naukri"
|
NAUKRI = "naukri"
|
||||||
|
BDJOBS = "bdjobs" # Add this line
|
||||||
|
|
||||||
|
|
||||||
class SalarySource(Enum):
|
class SalarySource(Enum):
|
||||||
@@ -314,17 +316,20 @@ class ScraperInput(BaseModel):
|
|||||||
linkedin_company_ids: list[int] | None = None
|
linkedin_company_ids: list[int] | None = None
|
||||||
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
|
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
|
||||||
|
|
||||||
|
request_timeout: int = 60
|
||||||
|
|
||||||
results_wanted: int = 15
|
results_wanted: int = 15
|
||||||
hours_old: int | None = None
|
hours_old: int | None = None
|
||||||
|
|
||||||
|
|
||||||
class Scraper(ABC):
|
class Scraper(ABC):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
|
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
self.site = site
|
self.site = site
|
||||||
self.proxies = proxies
|
self.proxies = proxies
|
||||||
self.ca_cert = ca_cert
|
self.ca_cert = ca_cert
|
||||||
|
self.user_agent = user_agent
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|||||||
import math
|
import math
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date, timedelta
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
@@ -44,7 +44,7 @@ class Naukri(Scraper):
|
|||||||
jobs_per_page = 20
|
jobs_per_page = 20
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initializes NaukriScraper with the Naukri API URL
|
Initializes NaukriScraper with the Naukri API URL
|
||||||
@@ -277,7 +277,7 @@ class Naukri(Scraper):
|
|||||||
match = re.search(r"(\d+)\s*day", label)
|
match = re.search(r"(\d+)\s*day", label)
|
||||||
if match:
|
if match:
|
||||||
days = int(match.group(1))
|
days = int(match.group(1))
|
||||||
parsed_date = today.replace(day=today.day - days).date()
|
parsed_date = (today - timedelta(days = days)).date()
|
||||||
log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
|
log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
|
||||||
return parsed_date
|
return parsed_date
|
||||||
elif created_date:
|
elif created_date:
|
||||||
|
|||||||
@@ -47,11 +47,12 @@ class RotatingProxySession:
|
|||||||
"""Utility method to format a proxy string into a dictionary."""
|
"""Utility method to format a proxy string into a dictionary."""
|
||||||
if proxy.startswith("http://") or proxy.startswith("https://"):
|
if proxy.startswith("http://") or proxy.startswith("https://"):
|
||||||
return {"http": proxy, "https": proxy}
|
return {"http": proxy, "https": proxy}
|
||||||
|
if proxy.startswith("socks5://"):
|
||||||
|
return {"http": proxy, "https": proxy}
|
||||||
return {"http": f"http://{proxy}", "https": f"http://{proxy}"}
|
return {"http": f"http://{proxy}", "https": f"http://{proxy}"}
|
||||||
|
|
||||||
|
|
||||||
class RequestsRotating(RotatingProxySession, requests.Session):
|
class RequestsRotating(RotatingProxySession, requests.Session):
|
||||||
|
|
||||||
def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
|
def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
|
||||||
RotatingProxySession.__init__(self, proxies=proxies)
|
RotatingProxySession.__init__(self, proxies=proxies)
|
||||||
requests.Session.__init__(self)
|
requests.Session.__init__(self)
|
||||||
@@ -86,7 +87,6 @@ class RequestsRotating(RotatingProxySession, requests.Session):
|
|||||||
|
|
||||||
|
|
||||||
class TLSRotating(RotatingProxySession, tls_client.Session):
|
class TLSRotating(RotatingProxySession, tls_client.Session):
|
||||||
|
|
||||||
def __init__(self, proxies=None):
|
def __init__(self, proxies=None):
|
||||||
RotatingProxySession.__init__(self, proxies=proxies)
|
RotatingProxySession.__init__(self, proxies=proxies)
|
||||||
tls_client.Session.__init__(self, random_tls_extension_order=True)
|
tls_client.Session.__init__(self, random_tls_extension_order=True)
|
||||||
@@ -344,7 +344,7 @@ desired_order = [
|
|||||||
"company_num_employees",
|
"company_num_employees",
|
||||||
"company_revenue",
|
"company_revenue",
|
||||||
"company_description",
|
"company_description",
|
||||||
#naukri-specific fields
|
# naukri-specific fields
|
||||||
"skills",
|
"skills",
|
||||||
"experience_range",
|
"experience_range",
|
||||||
"company_rating",
|
"company_rating",
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class ZipRecruiter(Scraper):
|
|||||||
api_url = "https://api.ziprecruiter.com"
|
api_url = "https://api.ziprecruiter.com"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
|
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.79"
|
version = "1.1.82"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
|
||||||
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
|
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
|
||||||
homepage = "https://github.com/cullenwatson/JobSpy"
|
homepage = "https://github.com/cullenwatson/JobSpy"
|
||||||
|
|||||||
Reference in New Issue
Block a user