# Source code for zyte_parsers.star_rating

import copy
import re
from typing import List, Optional, Set
from urllib.parse import urlparse

from lxml.etree import strip_attributes
from lxml.html import HtmlElement, tostring

from .api import SelectorOrElement, input_to_element

# Highest value on the rating scale this module assumes. A 5-point scale is
# by far the most common, although 10 is also possible.
# Some code below hard-codes the 5-point scale and guards that assumption
# with ``assert BEST_RATING == 5``.
BEST_RATING = 5

# Regular expressions tried in order by _extract_rating_stars_attrib against
# lower-cased, whitespace-collapsed attribute text. In every pattern the first
# capture group is the numeric rating; the "of 5" variants hard-code a
# 5-point scale.
OF_STAR_PATTERNS = [
    r"^(\d+\.?\d*) stars",  # e.g. "4.5 stars"
    r"^(\d+\.?\d*) (out )?of 5 stars",  # e.g. "4 out of 5 stars", "4 of 5 stars"
    r"^rated (\d+\.?\d*) (out )?of 5\b",  # e.g. "rated 4.5 out of 5"
    r"\b(\d+\.?\d*) (out )?of 5\b",  # "<n> (out) of 5" anywhere in the text
    r"^(\d+\.?\d*)$",  # the text is just a number, e.g. "4.5"
]


def extract_rating_stars(node: SelectorOrElement) -> Optional[float]:
    """Extract a rating value from a node containing rating stars.

    Every element under ``node`` (including ``node`` itself) is passed to each
    of the private extractors; values outside ``1..BEST_RATING`` are dropped.
    A rating is returned only when the remaining candidates are unambiguous:

    * exactly one distinct value was found, or
    * exactly two distinct values were found and the larger one equals
      ``BEST_RATING``, in which case the smaller one is returned.

    :param node: Node that includes the rating stars.
    :return: Rating value as a float or None.
    """
    node = input_to_element(node)
    if any(_extract_rating_stars_nodes_quick_check(subnode) for subnode in node.iter()):
        # Work on a copy so the caller's tree is left untouched, and drop
        # ``ng-class`` attributes before the extractors inspect the nodes.
        node = copy.deepcopy(node)
        strip_attributes(node, "ng-class")

    extractors = (
        _extract_rating_stars_attrib,
        _extract_rating_stars_img,
        _extract_rating_stars_class,
        _extract_rating_stars_nodes,
        _extract_rating_stars_style_width,
    )
    extractions: Set[float] = set()
    for subnode in node.iter():
        for extractor in extractors:
            value = extractor(subnode)
            if value is not None and 1 <= value <= BEST_RATING:
                extractions.add(value)

    # Sets have no defined iteration order, so sort before picking a
    # candidate by position.
    li_extractions: List[float] = sorted(extractions)

    if len(li_extractions) == 1:
        return li_extractions[0]

    if len(li_extractions) == 2 and li_extractions[1] == BEST_RATING:
        # The larger candidate is the top of the scale; return the smaller one.
        return li_extractions[0]

    return None


def _extract_rating_stars_attrib(node: HtmlElement) -> Optional[float]:
    """Extract from descriptive attributes, e.g. a title like "4 out of 5 stars".

    The ``title``, ``alt`` and ``aria-label`` attributes are lower-cased and
    whitespace-normalized; empty ones are ignored. Patterns from
    ``OF_STAR_PATTERNS`` are tried in order, each against every text, and the
    first capture group of the first match is returned as a float.
    """
    candidates: List[str] = []
    for attr_name in ("title", "alt", "aria-label"):
        raw = node.attrib.get(attr_name, "")
        normalized = re.sub(r"\s+", " ", raw).lower().strip()
        if normalized:
            candidates.append(normalized)
    assert BEST_RATING == 5
    # Pattern-major order: an earlier pattern wins over an earlier attribute.
    for pattern in OF_STAR_PATTERNS:
        for candidate in candidates:
            found = re.search(pattern, candidate)
            if found is not None:
                return float(found.group(1))
    return None


def _extract_rating_stars_img(node: HtmlElement) -> Optional[float]:
    """Extract from the file name at the end of the node's ``src`` URL.

    Nodes without a ``src`` value and inline ``data:`` URIs yield None.
    """
    source = node.attrib.get("src", "").strip()
    if source == "" or source.startswith("data:"):
        return None
    url_path = urlparse(source).path
    # Keep only what follows the last slash (the whole path if there is none).
    _, _, file_name = url_path.rpartition("/")
    return _single_like_a_number(file_name)


def _single_like_a_number(text: str) -> Optional[float]:
    """Things similar to numbers in file names and URLs."""
    # 5.0, 5-0, 5_0, 50 are all fine
    numbers = re.findall(r"\d+[.\-_,]?\d*", text)
    if len(numbers) == 1:
        value = float(numbers[0].replace("-", ".").replace("_", ".").replace(",", "."))
        assert BEST_RATING == 5  # for below heuristics
        if value in {20, 30, 40, 50}:  # 10 is ambiguous, could be 10/10
            # FIXME 10 exclusion is a bit problematic as it creates a bias,
            # but any approach would have some bias, and it looks rare enough.
            value /= 10
        return value
    return None


# Number of same-tag child elements a node must have to be considered as
# stars encoded in HTML; tied to BEST_RATING.
N_CHILD_STARS = BEST_RATING


def _extract_rating_stars_nodes_quick_check(node: HtmlElement) -> bool:
    """Quick check whether an element might contain stars encoded as html.

    True when the element has exactly N_CHILD_STARS direct children and they
    all share one tag.
    """
    child_tags = [child.tag for child in node]
    return len(child_tags) == N_CHILD_STARS and len(set(child_tags)) == 1


def _extract_rating_stars_nodes(node: HtmlElement) -> Optional[float]:
    """Look for N_CHILD_STARS children,
    first N of one kind and rest of another kind.

    Children are compared by their serialized markup. If all of them are
    identical the result is N_CHILD_STARS; if the first N are identical to
    each other and the remaining ones are identical to each other (but differ
    from the first kind) the result is N; otherwise None.
    """
    if not _extract_rating_stars_nodes_quick_check(node):
        return None
    markup = [tostring(child, encoding="unicode").strip() for child in node]
    first_kind = markup[0]
    # Length of the leading run of children identical to the first one.
    run_length = next(
        (index for index, item in enumerate(markup) if item != first_kind),
        len(markup),
    )
    if run_length == len(markup):
        return float(N_CHILD_STARS)
    # The remaining children must all be of a single (necessarily different) kind.
    remainder = markup[run_length:]
    if all(item == remainder[0] for item in remainder):
        return float(run_length)
    return None


def _extract_rating_stars_class(node: HtmlElement) -> Optional[float]:
    """Extract rating from html class.

    Only class names mentioning "star", "rate" or "rating" are considered.
    A value is returned when those names yield exactly one distinct number
    within ``1..BEST_RATING``.
    """
    keywords = ("star", "rate", "rating")
    found = set()
    for class_name in node.attrib.get("class", "").lower().split():
        if not any(keyword in class_name for keyword in keywords):
            continue
        rating = _single_like_a_number(class_name)
        if rating is None or not 1 <= rating <= BEST_RATING:
            continue
        found.add(rating)
    return found.pop() if len(found) == 1 else None


def _extract_rating_stars_style_width(node: HtmlElement) -> Optional[float]:
    """Extract based on 'style="width:60%"' inline style.

    Widths of 20, 40, 60, 80 and 100 percent map to ratings 1 to 5. Only the
    ``width`` property itself is considered: declarations such as
    ``max-width``, ``min-width`` or ``border-width`` are not treated as a
    rating. Any whitespace (spaces, tabs, newlines) around the colon and the
    value is tolerated.

    :param node: Element whose ``style`` attribute is inspected.
    :return: Rating as a float, or None if no matching width is declared.
    """
    style = node.attrib.get("style", "").lower()
    assert BEST_RATING == 5
    for rating, width in enumerate([20, 40, 60, 80, 100], 1):
        # The lookbehind rejects "width" when it is the tail of a longer
        # property name (e.g. "max-width", "--bar-width").
        if re.search(rf"(?<![\w-])width\s*:\s*{width}\s*%", style):
            return float(rating)
    return None