mirror of https://github.com/Bunsly/JobSpy
214 lines
7.9 KiB
Python
from typing import Optional, Tuple
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

from .. import Scraper, ScraperInput, Site
from ...jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval


class LinkedInScraper(Scraper):
    def __init__(self):
        """
        Initializes LinkedInScraper with the LinkedIn job search url
        """
        site = Site(Site.LINKEDIN)
        url = "https://www.linkedin.com"
        super().__init__(site, url)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes LinkedIn for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        job_list: list[JobPost] = []
        seen_urls = set()
        page, processed_jobs, job_count = 0, 0, 0

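        # Maps a JobType member to the single-letter code used by LinkedIn's
        # f_JT job-type filter in the search query string (see params below).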
        def job_type_code(job_type):
            mapping = {
                JobType.FULL_TIME: "F",
                JobType.PART_TIME: "P",
                JobType.INTERNSHIP: "I",
                JobType.CONTRACT: "C",
                JobType.TEMPORARY: "T",
            }

            return mapping.get(job_type, "")

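        # Page through the public search results until enough postings are
        # collected; f_WT=2 filters for remote roles and f_AL for Easy Apply.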
        with requests.Session() as session:
            while len(job_list) < scraper_input.results_wanted:
                params = {
                    "keywords": scraper_input.search_term,
                    "location": scraper_input.location,
                    "distance": scraper_input.distance,
                    "f_WT": 2 if scraper_input.is_remote else None,
                    "f_JT": job_type_code(scraper_input.job_type)
                    if scraper_input.job_type
                    else None,
                    "pageNum": page,
                    "f_AL": "true" if scraper_input.easy_apply else None,
                }

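                # Drop unset filters so they don't end up as empty query params.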
                params = {k: v for k, v in params.items() if v is not None}
                response = session.get(
                    f"{self.url}/jobs/search", params=params, allow_redirects=True
                )

                if response.status_code != 200:
                    return JobResponse(
                        success=False,
                        error=f"Response returned {response.status_code}",
                    )

                soup = BeautifulSoup(response.text, "html.parser")

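                # The total result count is parsed once, from the first page's
                # header; it bounds how many cards get processed overall.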
                if page == 0:
                    job_count_tag = soup.find(
                        "span", class_="results-context-header__job-count"
                    )
                    # Guard against a missing header tag instead of raising
                    # AttributeError on .text
                    job_count_text = job_count_tag.text if job_count_tag else "0"
                    job_count = int("".join(filter(str.isdigit, job_count_text)) or 0)

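                # Each result card's "data-entity-urn" attribute ends in the
                # numeric job id used to build the /jobs/view/ URL; duplicate
                # URLs seen on earlier pages are skipped.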
                for job_card in soup.find_all(
                    "div",
                    class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
                ):
                    processed_jobs += 1
                    data_entity_urn = job_card.get("data-entity-urn", "")
                    job_id = (
                        data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
                    )
                    job_url = f"{self.url}/jobs/view/{job_id}"
                    if job_url in seen_urls:
                        continue
                    seen_urls.add(job_url)
                    job_info = job_card.find("div", class_="base-search-card__info")
                    if job_info is None:
                        continue
                    title_tag = job_info.find("h3", class_="base-search-card__title")
                    title = title_tag.text.strip() if title_tag else "N/A"

                    company_tag = job_info.find("a", class_="hidden-nested-link")
                    company = company_tag.text.strip() if company_tag else "N/A"

                    metadata_card = job_info.find(
                        "div", class_="base-search-card__metadata"
                    )
                    location: Location = LinkedInScraper.get_location(metadata_card)

                    # metadata_card may be absent; guard before calling .find()
                    datetime_tag = (
                        metadata_card.find("time", class_="job-search-card__listdate")
                        if metadata_card
                        else None
                    )
                    description, job_type = LinkedInScraper.get_description(job_url)
                    if datetime_tag:
                        datetime_str = datetime_tag["datetime"]
                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
                    else:
                        date_posted = None

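                    # No salary figures are parsed from the page, so the
                    # Compensation below carries only an interval and currency.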
                    job_post = JobPost(
                        title=title,
                        description=description,
                        company_name=company,
                        location=location,
                        date_posted=date_posted,
                        job_url=job_url,
                        job_type=job_type,
                        compensation=Compensation(
                            interval=CompensationInterval.YEARLY, currency="USD"
                        ),
                    )
                    job_list.append(job_post)
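                    # The stop condition is checked twice: once to leave the
                    # card loop, then again to leave the pagination loop.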
                    if (
                        len(job_list) >= scraper_input.results_wanted
                        or processed_jobs >= job_count
                    ):
                        break
                if (
                    len(job_list) >= scraper_input.results_wanted
                    or processed_jobs >= job_count
                ):
                    break

                page += 1

        job_list = job_list[: scraper_input.results_wanted]
        job_response = JobResponse(
            success=True,
            jobs=job_list,
            total_results=job_count,
        )
        return job_response

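    # Each posting's full description lives on its own page, so scrape()
    # issues one extra GET per job through this helper.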
    @staticmethod
    def get_description(job_page_url: str) -> Tuple[Optional[str], Optional[JobType]]:
        """
        Retrieves job description and job type by going to the job page url
        :param job_page_url:
        :return: description or None, JobType or None
        """
        response = requests.get(job_page_url, allow_redirects=True)
        if response.status_code not in range(200, 400):
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )

        text_content = None
        if div_content:
            text_content = " ".join(div_content.get_text().split()).strip()

        def get_job_type(
            soup: BeautifulSoup,
        ) -> Optional[JobType]:
            """
            Gets the job type from job page
            :param soup:
            :return: JobType or None
            """
            h3_tag = soup.find(
                "h3",
                class_="description__job-criteria-subheader",
                string=lambda text: text and "Employment type" in text,
            )

            employment_type = None
            if h3_tag:
                employment_type_span = h3_tag.find_next_sibling(
                    "span",
                    class_="description__job-criteria-text description__job-criteria-text--criteria",
                )
                if employment_type_span:
                    employment_type = employment_type_span.get_text(strip=True)
                    employment_type = employment_type.lower()
                    employment_type = employment_type.replace("-", "")

            if not employment_type:
                return None
            try:
                return JobType(employment_type)
            except ValueError:
                # Employment types not modeled by JobType (e.g. "volunteer")
                # are treated as unknown rather than raising.
                return None

        return text_content, get_job_type(soup)

    @staticmethod
    def get_location(metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.
        :param metadata_card
        :return: location
        """
        # Default to an empty Location so a missing metadata card or an
        # unexpected location format still yields a valid return value
        # (assumes Location's fields are optional).
        location = Location()
        if metadata_card is not None:
            location_tag = metadata_card.find(
                "span", class_="job-search-card__location"
            )
            location_string = location_tag.text.strip() if location_tag else "N/A"
            parts = location_string.split(", ")
            if len(parts) == 2:
                city, state = parts
                location = Location(
                    city=city,
                    state=state,
                )

        return location
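
# --- Example usage (illustrative sketch, not part of the original module) ----
# Assumes the package exposes LinkedInScraper and ScraperInput at these
# hypothetical import paths, and uses only ScraperInput fields referenced in
# scrape() above.
#
#     from jobspy.scrapers.linkedin import LinkedInScraper  # hypothetical path
#     from jobspy.scrapers import ScraperInput              # hypothetical path
#
#     scraper = LinkedInScraper()
#     job_response = scraper.scrape(
#         ScraperInput(
#             search_term="software engineer",
#             location="Dallas, TX",
#             distance=25,
#             is_remote=False,
#             easy_apply=False,
#             results_wanted=15,
#         )
#     )
#     if job_response.success:
#         for job in job_response.jobs:
#             print(job.title, job.company_name, job.job_url)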