Source code for fakepilot.xray
"""
Defines how the data is scraped from the Trustpilot site.
"""
# SPDX-License-Identifier: MIT
import re
import datetime
from functools import reduce
import operator
from bs4 import BeautifulSoup
# Name of the parser handed to BeautifulSoup: "lxml" when that package can
# be imported, the standard library's "html.parser" otherwise.
PARSER = "html.parser"
try:
    import lxml  # pylint: disable=unused-import
except ImportError:
    pass
else:
    PARSER = "lxml"
[docs]
def has_attr(attr_name):
    """
    Build a predicate that tests a tag for the attribute ``attr_name``.

    :param attr_name: Name of the attribute to look for.
    :return: A one-argument callable that returns
        ``tag.has_attr(attr_name)`` for the tag it receives.
    """

    def _tag_has_attr(tag):
        return tag.has_attr(attr_name)

    return _tag_has_attr
[docs]
def extract_url(tag):
    """
    Return the URL of the company as it is stored in Trustpilot.

    Trustpilot uses the company registered URL to uniquely identify
    a company. However, they aren't normalized: the same site can show up
    as ``www.company-site.es`` or ``company-site.es``.

    :param tag: BeautifulSoup tag that is searched for the company link.
    :return: All the strings inside the first element whose class matches
        ``link_internal``, concatenated.
    """
    # For May 2025 pages
    link_pattern = re.compile("link_internal")
    link_node = tag.find(class_=link_pattern)
    pieces = list(link_node.strings)
    return "".join(pieces)
[docs]
def extract_company_name(tag):
    """
    Return the name of the company.

    :param tag: BeautifulSoup tag that contains the display-name element.
    :return: The first string inside the first element whose class matches
        ``title_displayName``.
    """
    name_node = tag.find(class_=re.compile("title_displayName"))
    first_string = next(name_node.strings)
    return first_string
[docs]
def extract_rating_stats(tag):
    """
    Extract the number of reviews and the TrustScore.

    Both attributes are extracted simultaneously because they are shown
    together on the page.

    :param tag: BeautifulSoup tag that is searched for both values.
    :return: A tuple ``(nreviews, score)`` holding an ``int`` and a
        ``float``.
    :raises RuntimeError: If the element with the number of reviews is
        not found.
    :raises ValueError: If the text of that element does not start with a
        number.
    """
    nreviews_tag = tag.find(attrs={"data-reviews-count-typography": "true"})
    if not nreviews_tag:
        raise RuntimeError(
            "The tag where the score and the number of reviews are hasn't been found."
        )
    count_text = nreviews_tag.string or next(nreviews_tag.strings)
    # Keep only the first token, e.g. "1,234" out of "1,234 reviews". The
    # split is done on ASCII whitespace on purpose: a bare ``str.split()``
    # also splits on the no-break space that some countries use as the
    # thousand separator, which would cut "12\xa0345" down to "12".
    first_token = re.split(r"[ \t\r\n\f\v]+", count_text.strip(), maxsplit=1)[0]
    # The thousand separator is different for some countries: dot, comma,
    # no-break space or narrow no-break space.
    nreviews = re.sub(r"[.,\xa0\u202f]", "", first_token)
    score_tag = tag.find(attrs={"data-rating-typography": "true"})
    # Some countries use a decimal comma in the score.
    score = score_tag.string.replace(",", ".")
    return (int(nreviews), float(score))
[docs]
def extract_contact_info(tag):
    """
    Extract the phone, email and address fields.

    The address field does not have a specific structure, so every contact
    line that is neither an email nor a phone number is taken as the
    address.

    :param tag: BeautifulSoup tag that is searched for the contact list
        items.
    :return: A tuple ``(phone, email, address)``. A field that was not
        found is ``None``.
    """
    phone = email = address = None
    # NOTE(review): this only looks at the start of the line, so a line
    # that begins with two or more digits (for example a street number) is
    # also treated as a phone number.
    phone_re = re.compile(r"^\+?\d[\d-]+")
    # The hyphen goes last inside ``[._-]`` so it is a literal character and
    # not the ``.``-to-``_`` range, and the top-level domain class contains
    # letters only (no ``|``).
    email_re = re.compile(
        r"([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+"
    )
    # For May 2025 pages
    contact_elements = tag.find_all("li", class_=re.compile("styles_itemRow"))
    if contact_elements:
        # On modern pages the last element is the company's URL,
        # so we need to remove it from the contact element list.
        contact_elements = contact_elements[:-1]
    else:
        # For December 2023 pages
        contact_elements = tag.find_all(
            "li", class_=re.compile("styles_contactInfoElement")
        )
    for contact_info in contact_elements:
        line = ",".join(contact_info.strings)
        # The email test goes first: an email such as "24support@site.com"
        # starts with digits and would otherwise be stored as the phone.
        if email_re.search(line):
            email = line
        elif phone_re.search(line):
            phone = line
        else:
            address = line
    return (phone, email, address)
[docs]
def extract_categories(tag):
    """
    Return the company's category list.

    :param tag: BeautifulSoup tag that is searched for category elements.
    :return: A list with the ``.string`` of every element that carries the
        ``data-business-unit-info-category-typography`` attribute.
    """
    is_category = has_attr("data-business-unit-info-category-typography")
    categories = []
    for category_node in tag.find_all(is_category):
        categories.append(category_node.string)
    return categories
[docs]
def extract_is_claimed(tag):
    """
    Indicate if the Trustpilot company's page is claimed by the company.

    :param tag: BeautifulSoup tag whose strings are searched.
    :return: ``True`` if a string matching ``Claimed profile`` is found,
        ``False`` otherwise.
    """
    claimed_pattern = re.compile("Claimed profile")
    if tag.find(string=claimed_pattern):
        return True
    return False
[docs]
def extract_percentage_stars(tag):
    """
    Extract the percentage of reviews that the company has received for each
    rating (1 star, 2 stars, etc.).

    :param tag: BeautifulSoup tag that is searched for the side panel.
    :return: A dict that maps the number of stars (1 to 5) to the
        percentage as a ``float``, with ``None`` for a rating whose row or
        bar is missing. ``None`` is returned instead of the dict when no
        percentage at all could be read.
    """
    rating_dist_str = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}
    rating_dist = dict.fromkeys(rating_dist_str.values())
    # The rating distribution information is in a side panel. Also,
    # there are other tags in the page with the attributes data-star-rating,
    # so that's why we need to get first the side panel
    side_info_tag = tag.find(class_=re.compile("styles_businessInfoSideBar"))
    if side_info_tag is None:
        return None
    for number_stars_str, nstars in rating_dist_str.items():
        rating_tag = side_info_tag.find(attrs={"data-star-rating": number_stars_str})
        if rating_tag is None:
            continue
        bar_tag = rating_tag.find(
            class_=re.compile("rating-distribution-row_barValue")
        )
        if bar_tag is None:
            continue
        # The percentage is read from the text after the last ":" of the
        # ``style`` attribute; a trailing "%" and ";" are dropped.
        percentage = bar_tag.attrs["style"].split(":")[-1].strip().rstrip(";%")
        rating_dist[nstars] = float(percentage)
    # The values have to be checked, not the keys: iterating the dict itself
    # yields 1..5, which is always truthy. ``0.0`` is a valid percentage, so
    # the test is ``is not None`` instead of plain truthiness.
    if any(value is not None for value in rating_dist.values()):
        return rating_dist
    return None
[docs]
def parse_page(page):
    """
    Parse page with BeautifulSoup.

    The parser is the one named by the module-level ``PARSER`` constant:
    ``lxml`` if it is installed, ``html.parser`` if not.

    :param page: HTML document to be parsed.
    :type page: str
    :return: Parsed page with BeautifulSoup class.
    :rtype: :class:`bs4.BeautifulSoup`
    """
    soup = BeautifulSoup(page, PARSER)
    return soup
[docs]
def extract_company_info(tag):
    """
    Extract the data of a company.

    :param tag: BeautifulSoup tag passed to every company extractor.
    :return: A dict with the keys ``name``, ``url``, ``nreviews``,
        ``score``, ``categories``, ``email``, ``phone``, ``address``,
        ``is_claimed`` and ``rating_distribution``.
    """
    try:
        rating_stats = extract_rating_stats(tag)
    except RuntimeError:
        # On old Trustpilot pages, if the company closed
        # then the company's page does not show the score or
        # number of reviews
        rating_stats = (None, None)
    nreviews, score = rating_stats
    phone, email, address = extract_contact_info(tag)

    company = {}
    company["name"] = extract_company_name(tag)
    company["url"] = extract_url(tag)
    company["nreviews"] = nreviews
    company["score"] = score
    company["categories"] = extract_categories(tag)
    company["email"] = email
    company["phone"] = phone
    company["address"] = address
    company["is_claimed"] = extract_is_claimed(tag)
    company["rating_distribution"] = extract_percentage_stars(tag)
    return company
[docs]
def extract_review_author_name(tag):
    """
    Extract the review's author's name.

    :param tag: BeautifulSoup tag of the review.
    :return: The ``.string`` of the element marked with
        ``data-consumer-name-typography="true"``.
    """
    name_attrs = {"data-consumer-name-typography": "true"}
    return tag.find(attrs=name_attrs).string
[docs]
def extract_review_author_id(tag):
    """
    Extract the review's author id.

    :param tag: BeautifulSoup tag of the review.
    :return: The part of the profile link's ``href`` that follows its last
        ``/``.
    """
    profile_link = tag.find(attrs={"data-consumer-profile-link": "true"})
    # The author link is https://www.trustpilot.com/users/66642b4....954121bbb4cc643
    href = profile_link.get("href")
    return href.rpartition("/")[2]
[docs]
def extract_review_rating(tag):
    """
    Extract the rating in the review.

    :param tag: BeautifulSoup tag of the review.
    :return: The value of the ``data-service-review-rating`` attribute as a
        ``float``.
    """
    rating_attr = "data-service-review-rating"
    rating_node = tag.find(has_attr(rating_attr))
    raw_rating = rating_node.attrs[rating_attr]
    return float(raw_rating)
[docs]
def extract_review_date(tag):
    """
    Extract the date the review was posted.

    :param tag: BeautifulSoup tag of the review.
    :return: The ``datetime`` attribute of the date element parsed into a
        naive :class:`datetime.datetime`.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    date_node = tag.find(attrs={"data-service-review-date-time-ago": "true"})
    timestamp = date_node["datetime"]
    return datetime.datetime.strptime(timestamp, timestamp_format)
[docs]
def extract_review_title(tag):
    """
    Extract the title of the review.

    :param tag: BeautifulSoup tag of the review.
    :return: The ``.string`` of the element that carries the
        ``data-service-review-title-typography`` attribute, with the
        surrounding whitespace stripped.
    """
    is_title = has_attr("data-service-review-title-typography")
    raw_title = tag.find(is_title).string
    return raw_title.strip()
[docs]
def concat_strings(node):
    """
    Concatenate the strings contained in ``node`` as a unique and complete
    string.

    We need to check if there is just one or more strings. In case of the
    latter, then we need to concatenate them.
    See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string

    :param node: Object exposing ``.string`` and ``.strings``.
    :return: ``str(node.string)`` when ``node.string`` is truthy, otherwise
        all the items of ``node.strings`` joined together. A node without
        any string gives ``""``.
    """
    if node.string:
        return str(node.string)
    # ``str.join`` copes with an empty ``node.strings``, whereas a
    # ``reduce`` without an initial value raises ``TypeError`` there.
    return "".join(node.strings)
[docs]
def extract_review_content(tag):
    """
    Extract the content or body of the review.

    :param tag: BeautifulSoup tag of the review.
    :return: The review text with the newline characters removed and the
        surrounding whitespace stripped, or ``""`` when no text element is
        found.
    """
    content_node = tag.find(attrs={"data-service-review-text-typography": "true"})
    if content_node:
        full_text = concat_strings(content_node)
        return full_text.replace("\n", "").strip()
    return ""
[docs]
def extract_number_reviews_author(tag):
    """
    Extract the number of reviews made by the author of the current review.

    :param tag: BeautifulSoup tag of the review.
    :return: The value of the ``data-consumer-reviews-count`` attribute as
        an ``int``.
    """
    count_attr = "data-consumer-reviews-count"
    count_node = tag.find(has_attr(count_attr))
    raw_count = count_node.attrs[count_attr]
    return int(raw_count)
[docs]
def extract_authors_country(tag):
    """
    Extract the country where the author is from.

    :param tag: BeautifulSoup tag of the review.
    :return: The concatenated strings of the element marked with
        ``data-consumer-country-typography="true"``.
    """
    country_attrs = {"data-consumer-country-typography": "true"}
    return concat_strings(tag.find(attrs=country_attrs))
[docs]
def extract_date_experience(tag):
    """
    Extract the date of experience of the review.

    :param tag: BeautifulSoup tag of the review.
    :return: A :class:`datetime.datetime` parsed with the ``%B %d, %Y``
        format from the text that follows the last ``:`` of the element.
    """
    experience_attrs = {"data-service-review-date-of-experience-typography": "true"}
    experience_node = tag.find(attrs=experience_attrs)
    label_and_date = concat_strings(experience_node)
    # Only the part after the last colon is parsed as the date.
    date_text = label_and_date.split(":")[-1].strip()
    return datetime.datetime.strptime(date_text, "%B %d, %Y")
[docs]
def extract_is_verified(tag):
    """
    Extract if the review is verified.

    :param tag: BeautifulSoup tag of the review.
    :return: ``True`` if an element marked with
        ``data-review-label-tooltip-trigger-typography="true"`` is found
        and is truthy, ``False`` otherwise.
    """
    label_attrs = {"data-review-label-tooltip-trigger-typography": "true"}
    if tag.find(attrs=label_attrs):
        return True
    return False
[docs]
def extract_review_info(tag):
    """
    Extract the review's data.

    :param tag: BeautifulSoup tag of the review, passed to every review
        extractor.
    :return: A dict with the keys ``author_name``, ``author_id``,
        ``is_verified``, ``star_rating``, ``date``, ``title``, ``content``,
        ``nreviews``, ``country`` and ``date_experience``.
    """
    # The extractors run in the listed order, which is also the key order
    # of the resulting dict.
    field_extractors = (
        ("author_name", extract_review_author_name),
        ("author_id", extract_review_author_id),
        ("is_verified", extract_is_verified),
        ("star_rating", extract_review_rating),
        ("date", extract_review_date),
        ("title", extract_review_title),
        ("content", extract_review_content),
        ("nreviews", extract_number_reviews_author),
        ("country", extract_authors_country),
        ("date_experience", extract_date_experience),
    )
    return {field: extractor(tag) for field, extractor in field_extractors}