Source code for fakepilot

"""Trustpilot scraping Python package"""

# SPDX-License-Identifier: MIT

import re

from . import xray


def get_reviews(company_page, nreviews):
    """
    Get the reviews' data included in a company's Trustpilot page.

    The number of extracted reviews is the minimum of `nreviews` and the
    number of reviews in the company's page. A non-positive `nreviews`
    yields an empty list. ``None`` is forwarded unchanged to ``find_all``
    as its ``limit``, i.e. no limit is applied.

    :param company_page: HTML company's page where the reviews are
        extracted from.
    :type company_page: :class:`bs4.BeautifulSoup`
    :param nreviews: Number of reviews to be extracted.
    :type nreviews: int
    :return: Reviews of a company.
    :rtype: list(dict(str,))
    """
    # ``find_all`` does not interpret ``limit=0`` (or a negative limit) as
    # "no results", so a non-positive request is answered here directly to
    # honour the documented "minimum of nreviews and available" contract.
    if nreviews is not None and nreviews <= 0:
        return []

    def has_attr_data_service_review_card_paper(tag):
        """
        Check if ``tag`` has the attribute
        ``'data-service-review-card-paper'``.
        """
        return tag.has_attr("data-service-review-card-paper")

    reviews_section = company_page.find(
        class_=re.compile("styles_reviewListContainer")
    )

    # For 2023 pages: there is no dedicated reviews container, so the whole
    # page is searched instead.
    if not reviews_section:
        reviews_section = company_page

    review_tags = reviews_section.find_all(
        has_attr_data_service_review_card_paper, limit=nreviews
    )

    return [xray.extract_review_info(tag) for tag in review_tags]
def extract_info(file, with_reviews=False, nreviews=5):
    """
    Return the information of a company page.

    The page is parsed with ``xray.parse_page`` and the company data is
    obtained from ``xray.extract_company_info``. When `with_reviews` is
    true, the result of :func:`get_reviews` is stored under ``'reviews'``.

    :param file: Company's page of Trustpilot.
    :type file: file object
    :param with_reviews: Indicates whether the company's reviews are
        extracted.
    :type with_reviews: bool, optional
    :param nreviews: Number of reviews to be extracted. Ignored if
        `with_reviews` is ``False``.
    :type nreviews: int, optional
    :return: Company's information: name (``'name'``), URL (``'url'``),
        number of reviews in Trustpilot (``'nreviews'``), score
        (``'score'``) and if the company's profile is claimed
        (``'is_claimed'``) by the company. The categories, email
        (``'email'``), phone number (``'phone'``), address
        (``'address'``) and rating distribution
        (``'rating_distribution'``) are also included if they are on the
        page. In case of the reviews, which are included under the key
        ``'reviews'``, for each one the returned values are the author's
        name (``'author_name'``), the author's id (``'author_id'``) in
        Trustpilot, rating (``'star_rating'``), date of publication
        (``'date'``), the text content (``'content'``), the number of
        reviews made by the author of the review (``'nreviews'``), the
        country that the author is from (``'country'``), the date of
        experience (``'date_experience'``) and if the review is verified
        (``'is_verified'``).
    :rtype: dict(str, )
    """
    # NOTE(review): the key names listed above are produced by the ``xray``
    # helpers, not by this function. The score key is presumably
    # ``'score'`` (the previous docstring repeated ``'address'`` for it) --
    # confirm against ``xray.extract_company_info``.
    page = xray.parse_page(file)
    info = xray.extract_company_info(page)

    # Without reviews the company data is returned as extracted.
    if not with_reviews:
        return info

    info["reviews"] = get_reviews(page, nreviews)
    return info