Source code for fakepilot.xray

"""
Defines how the data is scraped from the Trustpilot site.
"""

# SPDX-License-Identifier: MIT

import re
from datetime import datetime

from bs4 import BeautifulSoup

try:
    import lxml  # pylint: disable=unused-import

    PARSER = "lxml"
except ImportError:
    PARSER = "html.parser"



[docs]
def extract_url(tag):
    """
    Return the URL of the company as it is stored in Trustpilot.

    Trustpilot uses the company's registered URL to uniquely identify
    a company, but the value is not normalized: the same site may show
    up as ``www.company-site.es`` or as ``company-site.es``.

    :param tag: Parsed node that contains the company's web site link.
    :return: Every text fragment of the first element whose class
             matches ``styles_websiteUrl``, concatenated in order.
    :rtype: str
    """

    url_class = re.compile("styles_websiteUrl")
    url_node = tag.find(class_=url_class)
    fragments = url_node.strings
    return "".join(fragments)




[docs]
def extract_company_name(tag):
    """
    Return the name of the company.

    Only the first text fragment of the element whose class matches
    ``title_displayName`` is returned.

    :param tag: Parsed node that contains the company's title.
    """
    name_class = re.compile("title_displayName")
    name_node = tag.find(class_=name_class)
    text_fragments = name_node.strings
    return next(text_fragments)




[docs]
def extract_rating_stats(tag):
    """
    Extract the number of reviews and the TrustScore.

    Both attributes are extracted simultaneously because they
    are in the same tag.

    :param tag: Parsed node that contains the rating summary.
    :return: A pair ``(nreviews, score)``.
    :rtype: tuple[int, float]
    :raises RuntimeError: If the review counter tag or the score tag
                          isn't found.
    :raises ValueError: If the review counter text holds no number.
    """

    nreviews_tag = tag.find(attrs={"data-reviews-count-typography": "true"})
    score_tag = tag.find(attrs={"data-rating-typography": "true"})

    if not nreviews_tag or not score_tag:
        raise RuntimeError(
            "The tag where the score and the number of reviews "
            "are hasn't been found."
        )

    # Use the tag's only string when there is one; otherwise the
    # counter is taken from the first text fragment.
    count_text = nreviews_tag.string or next(nreviews_tag.strings, "")

    # The thousand separator is different for some countries: it can
    # be ".", "," or a (non-breaking) space. Splitting on whitespace
    # would cut "1\xa0234" down to "1", so grab the whole run of digits
    # and separators and then drop everything that is not a digit.
    count_match = re.search(r"\d[\d.,\s]*", count_text)

    if not count_match:
        raise ValueError(f"No review count found in {count_text!r}.")

    nreviews = int(re.sub(r"\D", "", count_match.group()))

    # Some locales write the decimal separator as a comma.
    score = float(score_tag.string.replace(",", "."))
    return (nreviews, score)




[docs]
def extract_contact_info(tag):
    """
    Extract the phone, address and email fields.

    Every ``li`` element whose class matches
    ``styles_contactInfoElement`` is turned into one line (its text
    fragments joined with ``,``) and classified. When several elements
    fall in the same category, the last one wins.

    :param tag: Parsed node that contains the contact section.
    :return: A triple whose first element is the phone number, then
             the email and finally the address. A field that isn't
             found is ``None``.
    :rtype: tuple
    """

    phone = email = address = None

    # As the address field does not have a specific structure,
    # the other two are searched and whatever is left is taken as
    # the address field.
    phone_re = re.compile(r"^\+?\d[\d-]+")
    # "[._-]" keeps the dash last so it is a literal: written as
    # "[.-_]" it is the range from "." to "_", which overlaps with
    # letters and digits and makes the pattern backtrack exponentially.
    email_re = re.compile(
        r"([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+"
    )

    contact_elements = tag.find_all(
        "li", class_=re.compile("styles_contactInfoElement")
    )

    for contact_info in contact_elements:

        line = ",".join(contact_info.strings)

        # The email is checked first: an address such as
        # "24seven@example.com" also starts like a phone number.
        if email_re.search(line):
            email = line
        # A phone number carries no letters; this keeps postal
        # addresses that start with a house number out of the field.
        elif phone_re.search(line) and not any(ch.isalpha() for ch in line):
            phone = line
        else:
            address = line

    return (phone, email, address)




[docs]
def extract_categories(tag):
    """
    Return the company's category list.

    :param tag: Parsed node that contains the company's page.
    :return: The string of every link pointing to ``/categories/``
             inside the element whose class matches
             ``styles_categoriesList``, or ``None`` when that element
             is missing.
    """
    section = tag.find(class_=re.compile("styles_categoriesList"))

    if not section:
        return None

    category_links = section.find_all(href=re.compile("/categories/"))
    return [link.string for link in category_links]




[docs]
def parse_page(page):
    """
    Parse page with BeautifulSoup.

    Set the ``lxml``'s parser if it is installed. If not,
    the ``html.parser`` is used.

    :param page: HTML document to be parsed. Markup that is not
                 already a ``str`` is decoded as UTF-8.
    :type page: str or bytes
    :return: Parsed page with BeautifulSoup class.
    :rtype: :class:`bs4.BeautifulSoup`
    """

    # BeautifulSoup ignores ``from_encoding`` for markup that is already
    # text and emits a warning on every call, so the hint is only given
    # when there is something left to decode.
    if isinstance(page, str):
        return BeautifulSoup(page, PARSER)

    return BeautifulSoup(
        page,
        PARSER,
        from_encoding="utf-8",
    )




[docs]
def extract_company_info(tag):
    """
    Extract the data of a company.

    :param tag: Parsed node that contains the company's page.
    :return: Dictionary with the keys ``name``, ``url``, ``nreviews``,
             ``score``, ``categories``, ``email``, ``phone`` and
             ``address``. ``nreviews`` and ``score`` are ``None`` when
             the rating section raises :class:`RuntimeError`.
    :rtype: dict
    """
    try:
        rating_stats = extract_rating_stats(tag)
    except RuntimeError:
        # Best effort: the rest of the fields are still worth returning.
        rating_stats = (None, None)

    review_count, trust_score = rating_stats
    phone_number, email_address, postal_address = extract_contact_info(tag)

    company_name = extract_company_name(tag)
    company_url = extract_url(tag)
    company_categories = extract_categories(tag)

    return {
        "name": company_name,
        "url": company_url,
        "nreviews": review_count,
        "score": trust_score,
        "categories": company_categories,
        "email": email_address,
        "phone": phone_number,
        "address": postal_address,
    }




[docs]
def extract_author_name(tag):
    """
    Extract the review's author's name.

    :param tag: Parsed node that contains a single review.
    :return: The author's name converted with :meth:`str.title`.
    :rtype: str
    :raises ValueError: If the tag holding the name isn't found.
    """
    name_node = tag.find(attrs={"data-consumer-name-typography": "true"})

    if name_node:
        raw_name = name_node.string
        return raw_name.title()

    raise ValueError(
        "The tag where the author's name should be isn't\n"
        "            present."
    )




[docs]
def extract_author_id(tag):
    """
    Extract the review's author id.

    :param tag: Parsed node that contains a single review.
    :return: The ``href`` of the author's profile link without its
             leading ``/users/``.
    :rtype: str
    :raises ValueError: If the profile link isn't found.
    """
    profile_link = tag.find(attrs={"data-consumer-profile-link": "true"})

    if profile_link:
        profile_path = profile_link.get("href")
        return profile_path.removeprefix("/users/")

    raise ValueError(
        "The tag where the author's id should be isn't\n"
        "            present."
    )




[docs]
def extract_rating(tag):
    """
    Extract the rating in the review.

    The rating is the first digit between 0 and 5 found in the ``alt``
    text of the image inside the element whose class matches
    ``star-rating``.

    :param tag: Parsed node that contains a single review.
    :return: The rating given by the author.
    :rtype: float
    :raises ValueError: If the rating tag isn't found, or if its image
                        or a rating inside the ``alt`` text is missing.
    """
    star_rating_node = tag.find(class_=re.compile("star-rating"))

    if not star_rating_node:
        raise ValueError(
            "The tag where the review's rating should be isn't\n"
            "            present."
        )

    # A missing image or ``alt`` attribute is reported with the same
    # exception type as a missing tag, so callers deal with one error.
    star_image = star_rating_node.img
    alt_text = star_image.get("alt") if star_image is not None else None
    rating_match = re.search(r"[0-5]", alt_text or "")

    if not rating_match:
        raise ValueError(
            "The review's rating couldn't be read from the star image."
        )

    return float(rating_match.group())




[docs]
def extract_date(tag):
    """
    Extract the date the review was posted.

    Whatever follows the first ``.`` of the ``datetime`` attribute is
    dropped before the value is parsed as an ISO formatted timestamp.

    :param tag: Parsed node that contains a single review.
    :rtype: :class:`datetime.datetime`
    :raises ValueError: If the tag holding the date isn't found.
    """
    timestamp_node = tag.find(
        attrs={"data-service-review-date-time-ago": "true"}
    )

    if timestamp_node:
        raw_timestamp = timestamp_node["datetime"]
        before_dot, _, _ = raw_timestamp.partition(".")
        return datetime.fromisoformat(before_dot)

    raise ValueError("The tag where the review's date should be isn't present.")




[docs]
def extract_content(tag):
    """
    Extract the content or body of the review.

    When the review has no body, its title (an ``h2`` element) is used
    instead.

    :param tag: Parsed node that contains a single review.
    :return: The text of the review as a plain ``str``.
    :rtype: str
    :raises ValueError: If neither the body nor the title is found.
    """
    content_node = tag.find(attrs={"data-service-review-text-typography": "true"})

    if not content_node:

        content_node = tag.find(
            "h2", attrs={"data-service-review-title-typography": "true"}
        )

        if not content_node:
            raise ValueError(
                "The tag where the review's content should be isn't present."
            )

    if content_node.string:
        return str(content_node.string)

    # The node holds several text fragments (or none): glue them
    # together in one pass instead of growing a string with ``+=``.
    return "".join(str(fragment) for fragment in content_node.strings)




[docs]
def extract_review_info(tag):
    """
    Extract the review's data.

    :param tag: Parsed node that contains a single review.
    :return: Dictionary with the keys ``author_name``, ``author_id``,
             ``star_rating``, ``date`` and ``content``.
    :rtype: dict
    """
    author_name = extract_author_name(tag)
    author_id = extract_author_id(tag)
    star_rating = extract_rating(tag)
    posted_on = extract_date(tag)
    body = extract_content(tag)

    return {
        "author_name": author_name,
        "author_id": author_id,
        "star_rating": star_rating,
        "date": posted_on,
        "content": body,
    }