# Source code for fakepilot.xray

"""
Defines how the data is scraped from the Trustpilot site.
"""

# SPDX-License-Identifier: MIT

import re
import datetime
from functools import reduce
import operator

from bs4 import BeautifulSoup

# Choose the parser name handed to BeautifulSoup: "lxml" when that package
# can be imported, otherwise the "html.parser" fallback.
try:
    import lxml  # pylint: disable=unused-import

    PARSER = "lxml"
except ImportError:
    PARSER = "html.parser"


def has_attr(attr_name):
    """Build a predicate that tells whether a tag carries ``attr_name``.

    :param attr_name: Name of the attribute to look for.
    :return: A one-argument callable returning ``tag.has_attr(attr_name)``
        for the tag it is given.
    """

    def tag_has_attribute(tag):
        return tag.has_attr(attr_name)

    return tag_has_attribute
def extract_url(tag):
    """
    Return the company's URL exactly as Trustpilot stores it.

    Trustpilot identifies a company by its registered URL, but those URLs
    are not normalized: the same site may appear as
    ``www.company-site.es`` or as ``company-site.es``. No normalization is
    applied here either.

    :param tag: Parsed company page (or a tag containing the link).
    :return: Concatenation of every string found inside the first element
        whose class matches ``link_internal``.
    """
    # Markup used by the May 2025 pages
    link_pattern = re.compile("link_internal")
    link_node = tag.find(class_=link_pattern)
    pieces = list(link_node.strings)
    return "".join(pieces)
def extract_company_name(tag):
    """
    Return the name of the company.

    :param tag: Parsed company page.
    :return: The first string found inside the element whose class matches
        ``title_displayName``.
    """
    title_node = tag.find(class_=re.compile("title_displayName"))
    title_strings = title_node.strings
    # Only the first string is the name; any following strings are ignored.
    return next(title_strings)
def extract_rating_stats(tag):
    """
    Extract the number of reviews and the TrustScore.

    Both attributes are extracted simultaneously because they are in the
    same tag.

    :param tag: Parsed company page.
    :return: A pair ``(number_of_reviews, score)``.
    :rtype: tuple[int, float]
    :raises RuntimeError: If the review-count element or the score element
        (or the score's text) cannot be found.
    """
    not_found_msg = (
        "The tag where the score and the number of reviews are hasn't been found."
    )

    nreviews_tag = tag.find(attrs={"data-reviews-count-typography": "true"})
    if nreviews_tag is None:
        raise RuntimeError(not_found_msg)

    # With a single string the count is its first whitespace-separated token;
    # with several strings the count is taken to be the first one.
    nreviews = (
        nreviews_tag.string.split()[0]
        if nreviews_tag.string
        else next(nreviews_tag.strings)
    )

    # The thousand separator is different for some countries
    nreviews = re.sub(r"[.,\xa0]", "", nreviews)

    score_tag = tag.find(attrs={"data-rating-typography": "true"})
    # A missing score is reported with the same exception type as a missing
    # review count, so that callers catching RuntimeError handle both cases.
    if score_tag is None or score_tag.string is None:
        raise RuntimeError(not_found_msg)

    # Some locales write the decimal separator as a comma.
    score = score_tag.string.replace(",", ".")

    return (int(nreviews), float(score))
def extract_contact_info(tag):
    """
    Extract the phone, email and address fields.

    The address has no recognizable structure, so every contact line is
    first tested as a phone number, then as an email; whatever matches
    neither is taken to be the address.

    :param tag: Parsed company page.
    :return: A triple ``(phone, email, address)``; each element is ``None``
        when no contact line was classified as that field.
    :rtype: tuple
    """
    phone = email = address = None

    # NOTE(review): a line that merely starts with two or more digits (for
    # example an address beginning with a house number) also matches this
    # pattern and is reported as the phone -- confirm whether that matters.
    phone_re = re.compile(r"^\+?\d[\d-]+")

    # The separator class is written ``[._-]`` on purpose. The previous
    # ``[.-_]`` was an accidental range from "." to "_" that excluded the
    # hyphen, included digits and upper-case letters, and therefore allowed
    # catastrophic backtracking on long alphanumeric runs. Likewise
    # ``[A-Z|a-z]`` accepted a literal "|" inside the top-level domain.
    email_re = re.compile(
        r"([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+"
    )

    # For May 2025 pages
    contact_elements = tag.find_all("li", class_=re.compile("styles_itemRow"))

    if contact_elements:
        # On modern pages the last element is the company's URL,
        # so we need to remove it from the contact element list.
        contact_elements = contact_elements[:-1]
    else:
        # For December 2023 pages
        contact_elements = tag.find_all(
            "li", class_=re.compile("styles_contactInfoElement")
        )

    for contact_info in contact_elements:
        line = ",".join(contact_info.strings)

        if phone_re.search(line):
            phone = line
        elif email_re.search(line):
            email = line
        else:
            address = line

    return (phone, email, address)
def extract_categories(tag):
    """
    Return the company's category list.

    :param tag: Parsed company page.
    :return: The ``string`` of every element that carries the
        ``data-business-unit-info-category-typography`` attribute, in the
        order they are found.
    :rtype: list
    """
    is_category = has_attr("data-business-unit-info-category-typography")
    return [category_node.string for category_node in tag.find_all(is_category)]
def extract_is_claimed(tag):
    """
    Tell whether the company has claimed its Trustpilot page.

    :param tag: Parsed company page.
    :return: ``True`` when some string in the page matches
        ``Claimed profile`` (case-sensitive), ``False`` otherwise.
    :rtype: bool
    """
    return bool(tag.find(string=re.compile("Claimed profile")))
def extract_percentage_stars(tag):
    """
    Extract the percentage of reviews that the company has received for
    each rating (1 star, 2 stars, etc.).

    :param tag: Parsed company page.
    :return: A dict mapping each star count (1 to 5) to its percentage as a
        float, with ``None`` for ratings whose bar was not found; or
        ``None`` when no percentage at all could be extracted.
    :rtype: dict[int, float | None] | None
    """
    star_names = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}
    rating_dist = dict.fromkeys(star_names.values())

    # The rating distribution information is in a side panel. Other tags in
    # the page also carry the data-star-rating attribute, which is why the
    # side panel has to be located first.
    side_info_tag = tag.find(class_=re.compile("styles_businessInfoSideBar"))
    if side_info_tag is None:
        return None

    bar_pattern = re.compile("rating-distribution-row_barValue")
    for star_name, nstars in star_names.items():
        rating_tag = side_info_tag.find(attrs={"data-star-rating": star_name})
        if rating_tag is None:
            continue

        bar_tag = rating_tag.find(class_=bar_pattern)
        if bar_tag is None:
            # Row present but without a bar: leave this rating as None.
            continue

        # The percentage is the value of the last "name: value" pair of the
        # inline style; a trailing ";" or surrounding blanks are tolerated.
        percentage = bar_tag.attrs["style"].split(":")[-1].strip(" ;%")
        rating_dist[nstars] = float(percentage)

    # Check the values, not the keys: the keys 1..5 are always truthy, and a
    # legitimate 0.0 percentage must still count as extracted data.
    if all(value is None for value in rating_dist.values()):
        return None

    return rating_dist
def parse_page(page):
    """
    Parse an HTML document with BeautifulSoup.

    The parser is the one named by the module-level ``PARSER`` constant:
    ``lxml`` if it is installed, ``html.parser`` otherwise.

    :param page: HTML document to be parsed.
    :type page: str
    :return: Parsed page with BeautifulSoup class.
    :rtype: :class:`bs4.BeautifulSoup`
    """
    soup = BeautifulSoup(page, features=PARSER)
    return soup
def extract_company_info(tag):
    """
    Extract the data of a company.

    :param tag: Parsed company page.
    :return: A dict with the keys ``name``, ``url``, ``nreviews``,
        ``score``, ``categories``, ``email``, ``phone``, ``address``,
        ``is_claimed`` and ``rating_distribution``. ``nreviews`` and
        ``score`` are both ``None`` when the rating statistics raise
        ``RuntimeError``.
    :rtype: dict
    """
    try:
        rating_stats = extract_rating_stats(tag)
    except RuntimeError:
        # On old Trustpilot pages, if the company closed then the company's
        # page does not show the score or the number of reviews.
        rating_stats = (None, None)
    nreviews, score = rating_stats

    phone, email, address = extract_contact_info(tag)

    company = {}
    company["name"] = extract_company_name(tag)
    company["url"] = extract_url(tag)
    company["nreviews"] = nreviews
    company["score"] = score
    company["categories"] = extract_categories(tag)
    company["email"] = email
    company["phone"] = phone
    company["address"] = address
    company["is_claimed"] = extract_is_claimed(tag)
    company["rating_distribution"] = extract_percentage_stars(tag)
    return company
def extract_review_author_name(tag):
    """
    Extract the review's author's name.

    :param tag: Tag holding one review.
    :return: The ``string`` of the element marked with
        ``data-consumer-name-typography="true"``.
    """
    name_attrs = {"data-consumer-name-typography": "true"}
    return tag.find(attrs=name_attrs).string
def extract_review_author_id(tag):
    """
    Extract the review's author id.

    The id is the last path segment of the author's profile link, e.g.
    ``https://www.trustpilot.com/users/66642b4....954121bbb4cc643``.

    :param tag: Tag holding one review.
    :return: Everything after the last ``/`` of the link's ``href`` (the
        whole ``href`` when it contains no ``/``).
    :rtype: str
    """
    profile_link = tag.find(attrs={"data-consumer-profile-link": "true"})
    href = profile_link.get("href")
    _, _, author_id = href.rpartition("/")
    return author_id
def extract_review_rating(tag):
    """
    Extract the rating in the review.

    :param tag: Tag holding one review.
    :return: The value of the ``data-service-review-rating`` attribute of
        the first element that has it, converted to ``float``.
    :rtype: float
    """
    rating_attr = "data-service-review-rating"
    rating_node = tag.find(has_attr(rating_attr))
    raw_rating = rating_node.attrs[rating_attr]
    return float(raw_rating)
def extract_review_date(tag):
    """
    Extract the date the review was posted.

    :param tag: Tag holding one review.
    :return: The ``datetime`` attribute of the element marked with
        ``data-service-review-date-time-ago="true"``, parsed with the
        format ``%Y-%m-%dT%H:%M:%S.%fZ``. The trailing ``Z`` is matched
        literally, so the result carries no timezone information.
    :rtype: datetime.datetime
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    time_node = tag.find(attrs={"data-service-review-date-time-ago": "true"})
    timestamp = time_node["datetime"]
    return datetime.datetime.strptime(timestamp, timestamp_format)
def extract_review_title(tag):
    """
    Extract the title of the review.

    :param tag: Tag holding one review.
    :return: The ``string`` of the first element carrying the
        ``data-service-review-title-typography`` attribute, with leading
        and trailing whitespace removed.
    :rtype: str
    """
    is_title = has_attr("data-service-review-title-typography")
    raw_title = tag.find(is_title).string
    return raw_title.strip()
def concat_strings(node):
    """
    Concatenate the strings contained in ``node`` as a unique and complete
    string.

    ``node.string`` is only usable when it yields a single non-empty string;
    otherwise every string in ``node.strings`` is joined. See
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string

    :param node: Object exposing ``string`` and ``strings`` like a
        BeautifulSoup tag.
    :return: The text of the node; the empty string when the node contains
        no strings at all.
    :rtype: str
    """
    single = node.string
    if single:
        return str(single)

    # ``str.join`` also covers a node with no strings, where a ``reduce``
    # without an initial value would raise ``TypeError``.
    return "".join(node.strings)
def extract_review_content(tag):
    """
    Extract the content or body of the review.

    :param tag: Tag holding one review.
    :return: The text of the element marked with
        ``data-service-review-text-typography="true"``, with newlines
        removed and surrounding whitespace stripped; the empty string when
        the review has no such element.
    :rtype: str
    """
    body_node = tag.find(attrs={"data-service-review-text-typography": "true"})
    if not body_node:
        return ""

    body_text = concat_strings(body_node)
    return body_text.replace("\n", "").strip()
def extract_number_reviews_author(tag):
    """
    Extract the number of reviews made by the author of the current review.

    :param tag: Tag holding one review.
    :return: The value of the ``data-consumer-reviews-count`` attribute of
        the first element that has it, converted to ``int``.
    :rtype: int
    """
    count_attr = "data-consumer-reviews-count"
    count_node = tag.find(has_attr(count_attr))
    raw_count = count_node.attrs[count_attr]
    return int(raw_count)
def extract_authors_country(tag):
    """
    Extract the country where the author is from.

    :param tag: Tag holding one review.
    :return: The concatenated text of the element marked with
        ``data-consumer-country-typography="true"``.
    """
    country_attrs = {"data-consumer-country-typography": "true"}
    return concat_strings(tag.find(attrs=country_attrs))
def extract_date_experience(tag):
    """
    Extract the date of experience of the review.

    :param tag: Tag holding one review.
    :return: The text after the last ``:`` of the element marked with
        ``data-service-review-date-of-experience-typography="true"``,
        parsed with the format ``%B %d, %Y``.
    :rtype: datetime.datetime
    """
    experience_format = "%B %d, %Y"
    experience_node = tag.find(
        attrs={"data-service-review-date-of-experience-typography": "true"}
    )
    label_and_date = concat_strings(experience_node)
    # Keep only what follows the last colon, i.e. drop any leading label.
    date_text = label_and_date.rpartition(":")[2].strip()
    return datetime.datetime.strptime(date_text, experience_format)
def extract_is_verified(tag):
    """
    Extract if the review is verified.

    :param tag: Tag holding one review.
    :return: ``True`` when the review contains an element marked with
        ``data-review-label-tooltip-trigger-typography="true"``, ``False``
        otherwise.
    :rtype: bool
    """
    label_attrs = {"data-review-label-tooltip-trigger-typography": "true"}
    return bool(tag.find(attrs=label_attrs))
def extract_review_info(tag):
    """
    Extract the review's data.

    :param tag: Tag holding one review.
    :return: A dict with the keys ``author_name``, ``author_id``,
        ``is_verified``, ``star_rating``, ``date``, ``title``, ``content``,
        ``nreviews``, ``country`` and ``date_experience``, each filled by
        the matching ``extract_*`` function of this module.
    :rtype: dict
    """
    # (field name, extractor) pairs, in the order the fields are produced.
    field_extractors = (
        ("author_name", extract_review_author_name),
        ("author_id", extract_review_author_id),
        ("is_verified", extract_is_verified),
        ("star_rating", extract_review_rating),
        ("date", extract_review_date),
        ("title", extract_review_title),
        ("content", extract_review_content),
        ("nreviews", extract_number_reviews_author),
        ("country", extract_authors_country),
        ("date_experience", extract_date_experience),
    )
    return {field: extract(tag) for field, extract in field_extractors}