Compare commits

..

9 Commits
1.1.82 ... main

Author SHA1 Message Date
Berkay Gemici
fda080a373 fix(linkedin): add fallback for date parsing on new job listings (#343)
LinkedIn uses two CSS classes for job posting dates:
- `job-search-card__listdate` for older posts
- `job-search-card__listdate--new` for recent posts (< 24h)

The scraper only checked the first class, causing `date_posted` to be
None for all fresh listings. This adds a fallback to also check for
the `--new` variant.
2026-02-18 13:39:52 -06:00
Sean
6e7ab6ff74 Fix: re Issue #295 (@krishianjan): added (seemingly missing) user_agent keyword argument to BDJobs 2026-01-09 23:28:27 -06:00
kj55-dev
7160d0faed fix: relax numpy version constraint to >=1.26.0 (#337) 2026-01-09 23:27:54 -06:00
Cullen Watson
6e014cf732 chore: codeowners 2025-08-23 22:42:45 +02:00
Kaushik H S
6e8576f8a8 fix(naukri): prevent str.find error by normalizing input and parsing before Markdown (#300) 2025-08-23 15:38:26 -05:00
Alexander Smirnov
51888004b7 Update __init__.py (#296)
pagination fix: start update with job_cards instead of job_list
2025-08-23 15:38:02 -05:00
Lixian Wang
b6d5cd8d79 fix:correct LinkedIn logger naming (#291)
* fix:correct LinkedIn logger naming

* add:linkedin description plain format
2025-08-23 15:37:49 -05:00
ZuoyunZheng
84ed670df3 chore: bump markdownify from 0.13.1 to 1.1.0 (#290) 2025-08-23 15:37:34 -05:00
Cullen Watson
4b16ac7967 chore:readme 2025-07-28 17:19:56 +02:00
11 changed files with 39 additions and 64 deletions

1
.github/CODEOWNERS vendored Normal file
View File

@@ -0,0 +1 @@
* @cullenwatson

View File

@@ -89,7 +89,6 @@ Optional
| |
├── user_agent (str): ├── user_agent (str):
| override the default user agent which may be outdated | override the default user agent which may be outdated
|
├── description_format (str): ├── description_format (str):
| markdown, html (Format type of the job descriptions. Default is markdown.) | markdown, html (Format type of the job descriptions. Default is markdown.)

View File

@@ -107,6 +107,7 @@ def scrape_jobs(
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
site_name = "LinkedIn" if cap_name == "Linkedin" else cap_name
create_logger(site_name).info(f"finished scraping") create_logger(site_name).info(f"finished scraping")
return site.value, scraped_data return site.value, scraped_data

View File

@@ -46,7 +46,7 @@ class BDJobs(Scraper):
band_delay = 3 band_delay = 3
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
): ):
""" """
Initializes BDJobsScraper with the BDJobs job search url Initializes BDJobsScraper with the BDJobs job search url

View File

@@ -35,6 +35,7 @@ from jobspy.util import (
extract_emails_from_text, extract_emails_from_text,
currency_parser, currency_parser,
markdown_converter, markdown_converter,
plain_converter,
create_session, create_session,
remove_attributes, remove_attributes,
create_logger, create_logger,
@@ -164,7 +165,7 @@ class LinkedIn(Scraper):
if continue_search(): if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay)) time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
start += len(job_list) start += len(job_cards)
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
@@ -208,6 +209,10 @@ class LinkedIn(Scraper):
if metadata_card if metadata_card
else None else None
) )
if not datetime_tag and metadata_card:
datetime_tag = metadata_card.find(
"time", class_="job-search-card__listdate--new"
)
date_posted = None date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs: if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
@@ -267,7 +272,8 @@ class LinkedIn(Scraper):
description = div_content.prettify(formatter="html") description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description) description = markdown_converter(description)
elif self.scraper_input.description_format == DescriptionFormat.PLAIN:
description = plain_converter(description)
h3_tag = soup.find( h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip() "h3", text=lambda text: text and "Job function" in text.strip()
) )

View File

@@ -234,7 +234,7 @@ class Compensation(BaseModel):
class DescriptionFormat(Enum): class DescriptionFormat(Enum):
MARKDOWN = "markdown" MARKDOWN = "markdown"
HTML = "html" HTML = "html"
PLAIN = "plain"
class JobPost(BaseModel): class JobPost(BaseModel):
id: str | None = None id: str | None = None

View File

@@ -164,12 +164,15 @@ class Naukri(Scraper):
date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate")) date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))
job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}" job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
description = job.get("jobDescription") if full_descr else None raw_description = job.get("jobDescription") if full_descr else None
job_type = parse_job_type(raw_description) if raw_description else None
company_industry = parse_company_industry(raw_description) if raw_description else None
description = raw_description
if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description) description = markdown_converter(description)
job_type = parse_job_type(description) if description else None
company_industry = parse_company_industry(description) if description else None
is_remote = is_job_remote(title, description or "", location) is_remote = is_job_remote(title, description or "", location)
company_logo = job.get("logoPathV3") or job.get("logoPath") company_logo = job.get("logoPathV3") or job.get("logoPath")

View File

@@ -5,10 +5,12 @@ from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type from jobspy.util import get_enum_from_job_type
def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None: def parse_job_type(soup: BeautifulSoup |str) -> list[JobType] | None:
""" """
Gets the job type from the job page Gets the job type from the job page
""" """
if isinstance(soup, str):
soup = BeautifulSoup(soup, "html.parser")
job_type_tag = soup.find("span", class_="job-type") job_type_tag = soup.find("span", class_="job-type")
if job_type_tag: if job_type_tag:
job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "") job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
@@ -16,10 +18,12 @@ def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
return None return None
def parse_company_industry(soup: BeautifulSoup) -> str | None: def parse_company_industry(soup: BeautifulSoup | str) -> str | None:
""" """
Gets the company industry from the job page Gets the company industry from the job page
""" """
if isinstance(soup, str):
soup = BeautifulSoup(soup, "html.parser")
industry_tag = soup.find("span", class_="industry") industry_tag = soup.find("span", class_="industry")
return industry_tag.get_text(strip=True) if industry_tag else None return industry_tag.get_text(strip=True) if industry_tag else None

View File

@@ -157,6 +157,15 @@ def markdown_converter(description_html: str):
markdown = md(description_html) markdown = md(description_html)
return markdown.strip() return markdown.strip()
def plain_converter(decription_html:str):
from bs4 import BeautifulSoup
if decription_html is None:
return None
soup = BeautifulSoup(decription_html, "html.parser")
text = soup.get_text(separator=" ")
text = re.sub(r'\s+',' ',text)
return text.strip()
def extract_emails_from_text(text: str) -> list[str] | None: def extract_emails_from_text(text: str) -> list[str] | None:
if not text: if not text:

56
poetry.lock generated
View File

@@ -749,17 +749,6 @@ files = [
[package.extras] [package.extras]
all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
[[package]]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.7"
files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]
[[package]] [[package]]
name = "ipykernel" name = "ipykernel"
version = "6.29.5" version = "6.29.5"
@@ -1229,13 +1218,13 @@ files = [
[[package]] [[package]]
name = "markdownify" name = "markdownify"
version = "0.13.1" version = "1.1.0"
description = "Convert HTML to markdown." description = "Convert HTML to markdown."
optional = false optional = false
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "markdownify-0.13.1-py3-none-any.whl", hash = "sha256:1d181d43d20902bcc69d7be85b5316ed174d0dda72ff56e14ae4c95a4a407d22"}, {file = "markdownify-1.1.0-py3-none-any.whl", hash = "sha256:32a5a08e9af02c8a6528942224c91b933b4bd2c7d078f9012943776fc313eeef"},
{file = "markdownify-0.13.1.tar.gz", hash = "sha256:ab257f9e6bd4075118828a28c9d02f8a4bfeb7421f558834aa79b2dfeb32a098"}, {file = "markdownify-1.1.0.tar.gz", hash = "sha256:449c0bbbf1401c5112379619524f33b63490a8fa479456d41de9dc9e37560ebd"},
] ]
[package.dependencies] [package.dependencies]
@@ -1710,21 +1699,6 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
type = ["mypy (>=1.11.2)"] type = ["mypy (>=1.11.2)"]
[[package]]
name = "pluggy"
version = "1.5.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.8"
files = [
{file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
{file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
]
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]] [[package]]
name = "pre-commit" name = "pre-commit"
version = "4.0.1" version = "4.0.1"
@@ -1975,28 +1949,6 @@ files = [
[package.extras] [package.extras]
windows-terminal = ["colorama (>=0.4.6)"] windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pytest"
version = "7.4.4"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
{file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
]
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]] [[package]]
name = "python-dateutil" name = "python-dateutil"
version = "2.9.0.post0" version = "2.9.0.post0"
@@ -2869,4 +2821,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "57169347d2ce0ff19c4d3024ce000651bb3a816e36f454618f480741094fb4a7" content-hash = "6260adc8f96f6cf1ba4e2c23f05504c19e67140b9d346aed3d12eea6957b2104"

View File

@@ -21,10 +21,10 @@ python = "^3.10"
requests = "^2.31.0" requests = "^2.31.0"
beautifulsoup4 = "^4.12.2" beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0" pandas = "^2.1.0"
NUMPY = "1.26.3" numpy = ">=1.26.0"
pydantic = "^2.3.0" pydantic = "^2.3.0"
tls-client = "^1.0.1" tls-client = "^1.0.1"
markdownify = "^0.13.1" markdownify = "^1.1.0"
regex = "^2024.4.28" regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]