mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 03:54:31 -08:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
abd5878238 |
10
README.md
10
README.md
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
|
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter** & other job boards concurrently
|
||||||
- Aggregates the job postings in a dataframe
|
- Aggregates the job postings in a dataframe
|
||||||
- Proxies support to bypass blocking
|
- Proxies support to bypass blocking
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ import csv
|
|||||||
from jobspy import scrape_jobs
|
from jobspy import scrape_jobs
|
||||||
|
|
||||||
jobs = scrape_jobs(
|
jobs = scrape_jobs(
|
||||||
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
|
site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
|
||||||
search_term="software engineer",
|
search_term="software engineer",
|
||||||
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
||||||
location="San Francisco, CA",
|
location="San Francisco, CA",
|
||||||
@@ -59,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
|
|||||||
```plaintext
|
```plaintext
|
||||||
Optional
|
Optional
|
||||||
├── site_name (list|str):
|
├── site_name (list|str):
|
||||||
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt
|
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt, naukri, bdjobs
|
||||||
| (default is all)
|
| (default is all)
|
||||||
│
|
│
|
||||||
├── search_term (str)
|
├── search_term (str)
|
||||||
@@ -86,6 +86,10 @@ Optional
|
|||||||
│
|
│
|
||||||
├── easy_apply (bool):
|
├── easy_apply (bool):
|
||||||
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
|
||||||
|
|
|
||||||
|
├── user_agent (str):
|
||||||
|
| override the default user agent, which may be outdated
|
||||||
|
|
|
||||||
│
|
│
|
||||||
├── description_format (str):
|
├── description_format (str):
|
||||||
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
| markdown, html (Format type of the job descriptions. Default is markdown.)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#__init__.py
|
# __init__.py
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import random
|
import random
|
||||||
@@ -12,7 +12,12 @@ from bs4.element import Tag
|
|||||||
|
|
||||||
from jobspy.exception import BDJobsException
|
from jobspy.exception import BDJobsException
|
||||||
from jobspy.bdjobs.constant import headers, search_params
|
from jobspy.bdjobs.constant import headers, search_params
|
||||||
from jobspy.bdjobs.util import parse_location, parse_date, find_job_listings, is_job_remote
|
from jobspy.bdjobs.util import (
|
||||||
|
parse_location,
|
||||||
|
parse_date,
|
||||||
|
find_job_listings,
|
||||||
|
is_job_remote,
|
||||||
|
)
|
||||||
from jobspy.model import (
|
from jobspy.model import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Location,
|
Location,
|
||||||
@@ -89,17 +94,9 @@ class BDJobs(Scraper):
|
|||||||
response = self.session.get(
|
response = self.session.get(
|
||||||
self.search_url,
|
self.search_url,
|
||||||
params=params,
|
params=params,
|
||||||
timeout=getattr(scraper_input, 'request_timeout', 60)
|
timeout=getattr(scraper_input, "request_timeout", 60),
|
||||||
)
|
)
|
||||||
|
|
||||||
# DEBUG: Save the received HTML content
|
|
||||||
try:
|
|
||||||
with open("scraper_received_bdjobs.html", "w", encoding="utf-8") as f:
|
|
||||||
f.write(response.text)
|
|
||||||
log.info(f"Saved scraper response to scraper_received_bdjobs.html")
|
|
||||||
except Exception as e_write:
|
|
||||||
log.error(f"Error writing debug HTML file: {e_write}")
|
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
log.error(f"BDJobs response status code {response.status_code}")
|
log.error(f"BDJobs response status code {response.status_code}")
|
||||||
break
|
break
|
||||||
@@ -133,7 +130,7 @@ class BDJobs(Scraper):
|
|||||||
log.error(f"Error during scraping: {str(e)}")
|
log.error(f"Error during scraping: {str(e)}")
|
||||||
break
|
break
|
||||||
|
|
||||||
job_list = job_list[:scraper_input.results_wanted]
|
job_list = job_list[: scraper_input.results_wanted]
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
def _process_job(self, job_card: Tag) -> Optional[JobPost]:
|
def _process_job(self, job_card: Tag) -> Optional[JobPost]:
|
||||||
@@ -153,34 +150,74 @@ class BDJobs(Scraper):
|
|||||||
job_url = urljoin(self.base_url, job_url)
|
job_url = urljoin(self.base_url, job_url)
|
||||||
|
|
||||||
# Extract job ID from URL
|
# Extract job ID from URL
|
||||||
job_id = job_url.split("jobid=")[-1].split("&")[0] if "jobid=" in job_url else f"bdjobs-{hash(job_url)}"
|
job_id = (
|
||||||
|
job_url.split("jobid=")[-1].split("&")[0]
|
||||||
|
if "jobid=" in job_url
|
||||||
|
else f"bdjobs-{hash(job_url)}"
|
||||||
|
)
|
||||||
|
|
||||||
# Extract title
|
# Extract title
|
||||||
title = job_link.get_text(strip=True)
|
title = job_link.get_text(strip=True)
|
||||||
if not title:
|
if not title:
|
||||||
title_elem = job_card.find(["h2", "h3", "h4", "strong", "div"], class_=lambda c: c and "job-title-text" in c)
|
title_elem = job_card.find(
|
||||||
|
["h2", "h3", "h4", "strong", "div"],
|
||||||
|
class_=lambda c: c and "job-title-text" in c,
|
||||||
|
)
|
||||||
title = title_elem.get_text(strip=True) if title_elem else "N/A"
|
title = title_elem.get_text(strip=True) if title_elem else "N/A"
|
||||||
|
|
||||||
# Extract company name - IMPROVED
|
# Extract company name - IMPROVED
|
||||||
company_elem = job_card.find(["span", "div"], class_=lambda c: c and "comp-name-text" in (c or "").lower())
|
company_elem = job_card.find(
|
||||||
|
["span", "div"],
|
||||||
|
class_=lambda c: c and "comp-name-text" in (c or "").lower(),
|
||||||
|
)
|
||||||
if company_elem:
|
if company_elem:
|
||||||
company_name = company_elem.get_text(strip=True)
|
company_name = company_elem.get_text(strip=True)
|
||||||
else:
|
else:
|
||||||
# Try alternative selectors
|
# Try alternative selectors
|
||||||
company_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["company", "org", "comp-name"]))
|
company_elem = job_card.find(
|
||||||
company_name = company_elem.get_text(strip=True) if company_elem else "N/A"
|
["span", "div"],
|
||||||
|
class_=lambda c: c
|
||||||
|
and any(
|
||||||
|
term in (c or "").lower()
|
||||||
|
for term in ["company", "org", "comp-name"]
|
||||||
|
),
|
||||||
|
)
|
||||||
|
company_name = (
|
||||||
|
company_elem.get_text(strip=True) if company_elem else "N/A"
|
||||||
|
)
|
||||||
|
|
||||||
# Extract location
|
# Extract location
|
||||||
location_elem = job_card.find(["span", "div"], class_=lambda c: c and "locon-text-d" in (c or "").lower())
|
location_elem = job_card.find(
|
||||||
|
["span", "div"],
|
||||||
|
class_=lambda c: c and "locon-text-d" in (c or "").lower(),
|
||||||
|
)
|
||||||
if not location_elem:
|
if not location_elem:
|
||||||
location_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["location", "area", "locon"]))
|
location_elem = job_card.find(
|
||||||
location_text = location_elem.get_text(strip=True) if location_elem else "Dhaka, Bangladesh"
|
["span", "div"],
|
||||||
|
class_=lambda c: c
|
||||||
|
and any(
|
||||||
|
term in (c or "").lower()
|
||||||
|
for term in ["location", "area", "locon"]
|
||||||
|
),
|
||||||
|
)
|
||||||
|
location_text = (
|
||||||
|
location_elem.get_text(strip=True)
|
||||||
|
if location_elem
|
||||||
|
else "Dhaka, Bangladesh"
|
||||||
|
)
|
||||||
|
|
||||||
# Create Location object
|
# Create Location object
|
||||||
location = parse_location(location_text, self.country)
|
location = parse_location(location_text, self.country)
|
||||||
|
|
||||||
# Extract date posted
|
# Extract date posted
|
||||||
date_elem = job_card.find(["span", "div"], class_=lambda c: c and any(term in (c or "").lower() for term in ["date", "deadline", "published"]))
|
date_elem = job_card.find(
|
||||||
|
["span", "div"],
|
||||||
|
class_=lambda c: c
|
||||||
|
and any(
|
||||||
|
term in (c or "").lower()
|
||||||
|
for term in ["date", "deadline", "published"]
|
||||||
|
),
|
||||||
|
)
|
||||||
date_posted = None
|
date_posted = None
|
||||||
if date_elem:
|
if date_elem:
|
||||||
date_text = date_elem.get_text(strip=True)
|
date_text = date_elem.get_text(strip=True)
|
||||||
@@ -228,50 +265,87 @@ class BDJobs(Scraper):
|
|||||||
description = ""
|
description = ""
|
||||||
|
|
||||||
# Try to find the job content div first (as in correct.py)
|
# Try to find the job content div first (as in correct.py)
|
||||||
job_content_div = soup.find('div', class_='jobcontent')
|
job_content_div = soup.find("div", class_="jobcontent")
|
||||||
if job_content_div:
|
if job_content_div:
|
||||||
# Look for responsibilities section
|
# Look for responsibilities section
|
||||||
responsibilities_heading = job_content_div.find('h4', id='job_resp') or job_content_div.find(['h4', 'h5'], string=lambda s: s and 'responsibilities' in s.lower())
|
responsibilities_heading = job_content_div.find(
|
||||||
|
"h4", id="job_resp"
|
||||||
|
) or job_content_div.find(
|
||||||
|
["h4", "h5"], string=lambda s: s and "responsibilities" in s.lower()
|
||||||
|
)
|
||||||
if responsibilities_heading:
|
if responsibilities_heading:
|
||||||
responsibilities_elements = []
|
responsibilities_elements = []
|
||||||
# Find all following elements until the next heading or hr
|
# Find all following elements until the next heading or hr
|
||||||
for sibling in responsibilities_heading.find_next_siblings():
|
for sibling in responsibilities_heading.find_next_siblings():
|
||||||
if sibling.name in ['hr', 'h4', 'h5']:
|
if sibling.name in ["hr", "h4", "h5"]:
|
||||||
break
|
break
|
||||||
if sibling.name == 'ul':
|
if sibling.name == "ul":
|
||||||
responsibilities_elements.extend(li.get_text(separator=' ', strip=True) for li in sibling.find_all('li'))
|
responsibilities_elements.extend(
|
||||||
elif sibling.name == 'p':
|
li.get_text(separator=" ", strip=True)
|
||||||
responsibilities_elements.append(sibling.get_text(separator=' ', strip=True))
|
for li in sibling.find_all("li")
|
||||||
|
)
|
||||||
|
elif sibling.name == "p":
|
||||||
|
responsibilities_elements.append(
|
||||||
|
sibling.get_text(separator=" ", strip=True)
|
||||||
|
)
|
||||||
|
|
||||||
description = "\n".join(responsibilities_elements) if responsibilities_elements else ""
|
description = (
|
||||||
|
"\n".join(responsibilities_elements)
|
||||||
|
if responsibilities_elements
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
|
||||||
# If no description found yet, try the original approach
|
# If no description found yet, try the original approach
|
||||||
if not description:
|
if not description:
|
||||||
description_elem = soup.find(["div", "section"], class_=lambda c: c and any(term in (c or "").lower() for term in ["job-description", "details", "requirements"]))
|
description_elem = soup.find(
|
||||||
|
["div", "section"],
|
||||||
|
class_=lambda c: c
|
||||||
|
and any(
|
||||||
|
term in (c or "").lower()
|
||||||
|
for term in ["job-description", "details", "requirements"]
|
||||||
|
),
|
||||||
|
)
|
||||||
if description_elem:
|
if description_elem:
|
||||||
description_elem = remove_attributes(description_elem)
|
description_elem = remove_attributes(description_elem)
|
||||||
description = description_elem.prettify(formatter="html")
|
description = description_elem.prettify(formatter="html")
|
||||||
if hasattr(self.scraper_input, 'description_format') and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if (
|
||||||
|
hasattr(self.scraper_input, "description_format")
|
||||||
|
and self.scraper_input.description_format
|
||||||
|
== DescriptionFormat.MARKDOWN
|
||||||
|
):
|
||||||
description = markdown_converter(description)
|
description = markdown_converter(description)
|
||||||
|
|
||||||
# Extract job type
|
# Extract job type
|
||||||
job_type_elem = soup.find(["span", "div"], string=lambda s: s and any(term in (s or "").lower() for term in ["job type", "employment type"]))
|
job_type_elem = soup.find(
|
||||||
|
["span", "div"],
|
||||||
|
string=lambda s: s
|
||||||
|
and any(
|
||||||
|
term in (s or "").lower()
|
||||||
|
for term in ["job type", "employment type"]
|
||||||
|
),
|
||||||
|
)
|
||||||
job_type = None
|
job_type = None
|
||||||
if job_type_elem:
|
if job_type_elem:
|
||||||
job_type_text = job_type_elem.find_next(["span", "div"]).get_text(strip=True)
|
job_type_text = job_type_elem.find_next(["span", "div"]).get_text(
|
||||||
|
strip=True
|
||||||
|
)
|
||||||
job_type = job_type_text if job_type_text else None
|
job_type = job_type_text if job_type_text else None
|
||||||
|
|
||||||
# Extract company industry
|
# Extract company industry
|
||||||
industry_elem = soup.find(["span", "div"], string=lambda s: s and "industry" in (s or "").lower())
|
industry_elem = soup.find(
|
||||||
|
["span", "div"], string=lambda s: s and "industry" in (s or "").lower()
|
||||||
|
)
|
||||||
company_industry = None
|
company_industry = None
|
||||||
if industry_elem:
|
if industry_elem:
|
||||||
industry_text = industry_elem.find_next(["span", "div"]).get_text(strip=True)
|
industry_text = industry_elem.find_next(["span", "div"]).get_text(
|
||||||
|
strip=True
|
||||||
|
)
|
||||||
company_industry = industry_text if industry_text else None
|
company_industry = industry_text if industry_text else None
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
"job_type": job_type,
|
"job_type": job_type,
|
||||||
"company_industry": company_industry
|
"company_industry": company_industry,
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.80"
|
version = "1.1.82"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
|
||||||
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
|
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
|
||||||
homepage = "https://github.com/cullenwatson/JobSpy"
|
homepage = "https://github.com/cullenwatson/JobSpy"
|
||||||
|
|||||||
Reference in New Issue
Block a user