Source code for fakepilot.xray
"""
Defines how the data is scraped from the Trustpilot site.
"""
# SPDX-License-Identifier: MIT
import re
from datetime import datetime
from bs4 import BeautifulSoup
# Select the BeautifulSoup backend: ``lxml`` when it can be imported,
# ``html.parser`` otherwise.
try:
    import lxml  # pylint: disable=unused-import
except ImportError:
    PARSER = "html.parser"
else:
    PARSER = "lxml"
[docs]
def extract_url(tag):
    """
    Return the URL of the company.

    Trustpilot uses the company's registered URL as its unique
    identifier, but the value is not normalized: it may show up as
    ``www.company-site.es`` or as ``company-site.es``.

    :param tag: Parsed node containing the element whose class matches
        ``styles_websiteUrl``.
    :return: URL of the company as it is stored in Trustpilot.
    """
    url_node = tag.find(class_=re.compile("styles_websiteUrl"))
    # The URL may be split across several text fragments; glue them
    # back together without any separator.
    fragments = url_node.strings
    return "".join(fragments)
[docs]
def extract_company_name(tag):
    """
    Return the name of the company.

    :param tag: Parsed node containing the element whose class matches
        ``title_displayName``.
    :return: The first text fragment found inside that element.
    """
    name_node = tag.find(class_=re.compile("title_displayName"))
    text_fragments = name_node.strings
    # Only the first fragment is the name; later fragments are ignored.
    return next(text_fragments)
[docs]
def extract_rating_stats(tag):
    """
    Extract the number of reviews and the TrustScore.

    Both attributes are extracted simultaneously because they
    are in the same tag.

    :param tag: Parsed node containing the rating section.
    :return: A tuple ``(nreviews, score)`` with the review count as
        ``int`` and the TrustScore as ``float``.
    :raises RuntimeError: If the review-count tag or the score tag
        (or the score's text) is missing.
    :raises ValueError: If no number can be read from the review-count
        text.
    """
    missing_msg = (
        "The tag where the score and the number of reviews "
        "are hasn't been found."
    )
    nreviews_tag = tag.find(attrs={"data-reviews-count-typography": "true"})
    if nreviews_tag is None:
        raise RuntimeError(missing_msg)

    count_text = nreviews_tag.string or next(nreviews_tag.strings, "")
    # The thousand separator is different for some countries. Pick the
    # number with a regex instead of ``str.split()``: ``split()`` treats
    # the no-break space as whitespace and would cut "1\xa0234" to "1".
    count_match = re.search(r"\d+(?:[.,\xa0\u202f]\d+)*", count_text)
    if count_match is None:
        raise ValueError(f"No review count found in {count_text!r}.")
    nreviews = int(re.sub(r"[.,\xa0\u202f]", "", count_match.group()))

    score_tag = tag.find(attrs={"data-rating-typography": "true"})
    # Report a missing score the same way as a missing review count, so
    # callers that handle RuntimeError cover both cases.
    if score_tag is None or score_tag.string is None:
        raise RuntimeError(missing_msg)
    # Some locales use a decimal comma.
    score = float(score_tag.string.replace(",", "."))
    return (nreviews, score)
[docs]
def extract_contact_info(tag):
    """
    Extract the phone, email and address fields.

    Each ``li`` element whose class matches ``styles_contactInfoElement``
    is flattened to one line (its text fragments joined with ``,``) and
    classified as email, phone or address.

    :param tag: Parsed node containing the contact list.
    :return: A triple whose first element is the phone number, then
        the email and finally the address. Fields that were not found
        are ``None``.
    """
    phone = email = address = None
    # As the address field does not have a specific structure, the
    # other two are searched and whatever is left is the address.
    #
    # A phone line must consist only of digits and the usual separators.
    # Matching just a leading run of digits would also claim addresses
    # that start with a street number.
    phone_re = re.compile(r"^\+?\(?\d[\d\s().,-]+$")
    # ``-`` is kept last in the class so it is a literal, not a range.
    email_re = re.compile(
        r"(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+"
    )
    contact_elements = tag.find_all(
        "li", class_=re.compile("styles_contactInfoElement")
    )
    for contact_info in contact_elements:
        line = ",".join(contact_info.strings)
        # Test for the email first: it is the least ambiguous pattern
        # and may itself start with digits.
        if email_re.search(line):
            email = line
        elif phone_re.search(line):
            phone = line
        else:
            address = line
    return (phone, email, address)
[docs]
def extract_categories(tag):
    """
    Return the company's category list.

    :param tag: Parsed node that may contain an element whose class
        matches ``styles_categoriesList``.
    :return: The ``.string`` of every link inside that element whose
        ``href`` contains ``/categories/``, or ``None`` when the
        element is not found.
    """
    section = tag.find(class_=re.compile("styles_categoriesList"))
    if not section:
        return None
    category_links = section.find_all(href=re.compile("/categories/"))
    return [link.string for link in category_links]
[docs]
def parse_page(page):
    """
    Parse page with BeautifulSoup.

    Set the ``lxml``'s parser if it is installed. If not,
    the ``html.parser`` is used.

    :param page: HTML document to be parsed.
    :type page: str or bytes
    :return: Parsed page with BeautifulSoup class.
    :rtype: :class:`bs4.BeautifulSoup`
    """
    options = {}
    # An encoding hint only makes sense for markup that still has to be
    # decoded. For ``str`` input BeautifulSoup ignores ``from_encoding``
    # and emits a warning, so it is only passed for other input.
    if not isinstance(page, str):
        options["from_encoding"] = "utf-8"
    return BeautifulSoup(page, PARSER, **options)
[docs]
def extract_company_info(tag):
    """
    Extract the data of a company.

    :param tag: Parsed node containing the company's page.
    :return: A dict with the keys ``name``, ``url``, ``nreviews``,
        ``score``, ``categories``, ``email``, ``phone`` and ``address``.
        ``nreviews`` and ``score`` are ``None`` when the rating stats
        extraction raises ``RuntimeError``.
    """
    try:
        review_count, trust_score = extract_rating_stats(tag)
    except RuntimeError:
        # Rating stats could not be located: report both as missing.
        review_count = trust_score = None

    contact_phone, contact_email, contact_address = extract_contact_info(tag)

    company = {}
    company["name"] = extract_company_name(tag)
    company["url"] = extract_url(tag)
    company["nreviews"] = review_count
    company["score"] = trust_score
    company["categories"] = extract_categories(tag)
    company["email"] = contact_email
    company["phone"] = contact_phone
    company["address"] = contact_address
    return company
[docs]
def extract_author_name(tag):
    """
    Extract the review's author's name.

    :param tag: Parsed node containing a single review.
    :return: The author's name converted with :meth:`str.title`.
    :raises ValueError: If the tag holding the author's name is missing.
    """
    consumer_node = tag.find(attrs={"data-consumer-name-typography": "true"})
    if consumer_node is None:
        # Single-line message: a triple-quoted literal would embed a
        # newline and the source indentation in the error text.
        raise ValueError(
            "The tag where the author's name should be isn't present."
        )
    name = consumer_node.string
    if name is None:
        # Same fallback as ``extract_content``: join all text fragments
        # when the node has no single string.
        name = "".join(consumer_node.strings)
    return name.title()
[docs]
def extract_author_id(tag):
    """
    Extract the review's author id.

    :param tag: Parsed node containing a single review.
    :return: The profile link's ``href`` with a leading ``/users/``
        removed.
    :raises ValueError: If the profile link tag is missing or has no
        ``href`` attribute.
    """
    consumer_node = tag.find(attrs={"data-consumer-profile-link": "true"})
    if consumer_node is None:
        # Single-line message: a triple-quoted literal would embed a
        # newline and the source indentation in the error text.
        raise ValueError(
            "The tag where the author's id should be isn't present."
        )
    href = consumer_node.get("href")
    if href is None:
        raise ValueError("The author's profile link has no 'href' attribute.")
    return href.removeprefix("/users/")
[docs]
def extract_rating(tag):
    """
    Extract the rating in the review.

    The rating is the first digit between 0 and 5 found in the ``alt``
    text of the image inside the element whose class matches
    ``star-rating``.

    :param tag: Parsed node containing a single review.
    :return: The rating as a ``float``.
    :raises ValueError: If the star-rating tag is missing, or no rating
        can be read from its image.
    """
    star_rating_node = tag.find(class_=re.compile("star-rating"))
    if star_rating_node is None:
        # Single-line message: a triple-quoted literal would embed a
        # newline and the source indentation in the error text.
        raise ValueError(
            "The tag where the review's rating should be isn't present."
        )
    image = star_rating_node.img
    alt_text = image.get("alt", "") if image is not None else ""
    rating_match = re.search(r"[0-5]", alt_text)
    if rating_match is None:
        # Raise the documented exception type rather than failing on None.
        raise ValueError(
            "The review's rating couldn't be read from the star image."
        )
    return float(rating_match.group())
[docs]
def extract_date(tag):
    """
    Extract the date the review was posted.

    The ``datetime`` attribute is cut at the first ``.``, which drops
    fractional seconds and anything after them, and a trailing ``Z`` is
    removed. Timestamps of the form ``2023-05-01T12:34:56.000Z`` and
    ``2023-05-01T12:34:56Z`` therefore give the same naive datetime.

    :param tag: Parsed node containing a single review.
    :return: The parsed :class:`datetime.datetime`.
    :raises ValueError: If the date tag is missing or the remaining
        text is not a valid ISO 8601 timestamp.
    """
    date_node = tag.find(attrs={"data-service-review-date-time-ago": "true"})
    if date_node is None:
        raise ValueError("The tag where the review's date should be isn't present.")
    raw_timestamp = date_node["datetime"]
    # Without fractional seconds the "Z" survives the split. Strip it so
    # both forms parse the same way on every supported Python version.
    timestamp = raw_timestamp.split(".")[0].removesuffix("Z")
    return datetime.fromisoformat(timestamp)
[docs]
def extract_content(tag):
    """
    Extract the content or body of the review.

    The review text element is used when present; otherwise the
    review title (``h2``) is used instead.

    :param tag: Parsed node containing a single review.
    :return: The text as a ``str``.
    :raises ValueError: If neither the text nor the title element is
        present.
    """
    body_node = tag.find(
        attrs={"data-service-review-text-typography": "true"}
    ) or tag.find("h2", attrs={"data-service-review-title-typography": "true"})
    if not body_node:
        raise ValueError(
            "The tag where the review's content should be isn't present."
        )

    whole_text = body_node.string
    if whole_text:
        return str(whole_text)
    # No single string: concatenate every text fragment in order.
    return "".join(str(piece) for piece in body_node.strings)
[docs]
def extract_review_info(tag):
    """
    Extract the review's data.

    :param tag: Parsed node containing a single review.
    :return: A dict with the keys ``author_name``, ``author_id``,
        ``star_rating``, ``date`` and ``content``, each filled by the
        matching ``extract_*`` function.
    """
    # Field name -> extractor, listed in the order the fields are
    # produced and stored.
    field_extractors = (
        ("author_name", extract_author_name),
        ("author_id", extract_author_id),
        ("star_rating", extract_rating),
        ("date", extract_date),
        ("content", extract_content),
    )
    return {field: extractor(tag) for field, extractor in field_extractors}