"""Source code for zyte_parsers.breadcrumbs."""

import re
import string
from collections import Counter
from typing import List, Optional, Tuple

import attr

from .api import SelectorOrElement, input_to_element
from .utils import extract_link, extract_text, first_satisfying


@attr.s(frozen=True, auto_attribs=True)
class Breadcrumb:
    """A single breadcrumb item: a display name and/or a link target.

    Instances are immutable (``frozen=True``). Both fields default to
    ``None``; ``extract_breadcrumbs`` only creates items for which at least
    one of them is set.
    """

    # Cleaned text of the item (surrounding separators removed), if any.
    name: Optional[str] = None
    # Link of the item as returned by ``extract_link``, if any.
    url: Optional[str] = None


# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TRANS = str.maketrans("", "", string.punctuation)
# Characters that may act as a visual separator between breadcrumb items.
_BREADCRUMBS_SEP = (
    "ᐊᐅ<>ᐸᐳ‹›≺≻≪≫«»⋘⋙❬❭❮❯❰❱⟨⟩⟪⟫⫷⫸〈〉《》⦉⦊⭅⭆⭠⭢←→↤↦⇐⇒⇠⇢"
    "⇦⇨⇽⇾⟵⟶⟸⟹⟻⟼⟽⟾⮘⮚⮜⮞⯇⯈⊲⊳◀▶◁▷◂▸◃▹◄►◅▻➜➝➞➟➠➡➢➣➤➧➨➩"
    "➪➫➬➭➮➯➱➲/⁄\\⟋⟍⫻⫼⫽|𐬻¦‖∣⎪⎟⎸⎹│┃┆┇┊┋❘❙❚.,+:-"
)
# Regex character class matching any single separator character.
# The characters are escaped because the set contains a backslash: inserted
# verbatim into ``[...]`` it would be read as an escape of the character that
# follows it, and the backslash itself would never match.
_BREADCRUMBS_SEP_CLASS = f"[{re.escape(_BREADCRUMBS_SEP)}]"
# One separator: a run of separator characters (or the ASCII arrow "->").
SEP_REG_STR = rf"({_BREADCRUMBS_SEP_CLASS}+|->)"

# Separator run delimited by whitespace or by the string boundaries; used to
# split a single text into several breadcrumb names.
SPLIT_REG = re.compile(rf"(^|\s+){_BREADCRUMBS_SEP_CLASS}+($|\s+)")
# The whole string is a separator.
SEP_REG = re.compile(rf"^{SEP_REG_STR}$")
# Separator followed by whitespace at the start of the string.
LSTRIP_SEP_REG = re.compile(rf"^{SEP_REG_STR}\s+")
# Whitespace followed by a separator at the end of the string.
RSTRIP_SEP_REG = re.compile(rf"\s+{SEP_REG_STR}$")


def extract_breadcrumbs(
    node: SelectorOrElement, *, base_url: Optional[str], max_search_depth: int = 10
) -> Optional[Tuple[Breadcrumb, ...]]:
    """Extract breadcrumb items from node that represents breadcrumb component.

    It finds all anchor elements to specified maximal depth. Anchors are
    collected in pre-order traversal. Such strategy of traversing supports
    cases where structure of nodes representing breadcrumbs is flat,
    which means that breadcrumb's anchors are on the same depth of HTML
    structure and where breadcrumb items are nested, which means that element
    with next item can be a child of element with previous breadcrumb item.
    It also post-processes extracted breadcrumbs by using semantic markup or
    the location of breadcrumb separators.

    :param node: Node representing and including breadcrumb component.
    :param base_url: Base URL of site.
    :param max_search_depth: Max depth for searching anchors.
    :return: Tuple with breadcrumb items, or ``None`` when nothing was found.
    """

    def extract_breadcrumbs_rec(
        node,
        search_depth,
        breadcrumbs_accum,
        markup_hier_accum,
        separators_accum,
        list_tag_occured,
        curr_markup_hier,
    ):
        """
        Traverse html tree and search for elements that represent breadcrumb
        items with maximal depth of searching equal to `max_search_depth`.
        It also extracts breadcrumb items from element's tails since it often
        happens that non-anchor items are placed without any surrounding
        element.
        Because breadcrumb elements may contain dropdowns, the function
        filters them out by doing the following:
        * does not go into nested HTML list elements (<ol> and <ul>).
        * does not go into any HTML list elements with classes that relate
        to drop down, like "dropdown", "drop-down", "DropDown", etc.
        For every found element it does the following clean-up:
        * extracts name of breadcrumb from element's text or `title` attribute.
        * name cannot be a single character with punctuation like "»" or "|".
        * is able to parse name and split it from separators.
        * breadcrumb item has to contain name or url.
        * relative URLs are joined with base URL.

        The three ``*_accum`` lists are filled in place and kept parallel:
        one entry per collected breadcrumb item.
        """
        # Buttons are skipped entirely, including their tail text.
        if node.tag in {"button"}:
            return

        if node.tag == "a" or len(node) == 0:
            # Anchor or leaf element: a candidate breadcrumb item.
            title = node.get("title")
            name = first_satisfying(
                [
                    extract_text(node),
                    title.strip() if title else None,
                ]
            )
            url = extract_link(node, base_url)

            left_sep, parsed_name, right_sep = _parse_breadcrumb_name(name)
            # A leading separator belongs to the previous item, unless that
            # item already has a separator recorded after it.
            if left_sep and separators_accum and not separators_accum[-1]:
                separators_accum[-1] = left_sep
            if parsed_name or url:
                breadcrumbs_accum.append(Breadcrumb(parsed_name, url))
                markup_hier_accum.append(curr_markup_hier)
                separators_accum.append(right_sep)
        else:
            is_list_tag = node.tag in {"ul", "ol"}
            # Nested lists and drop-down-like lists are not descended into.
            skip_list_tag = is_list_tag and (
                _has_special_class(node.get("class")) or list_tag_occured
            )

            item_type = _extract_markup_type(node)

            if search_depth < max_search_depth and not skip_list_tag:
                # The hierarchy is identical for every child, so it is built
                # once. It is never mutated afterwards: each recursion level
                # works on its own copy.
                new_hierarchy = list(curr_markup_hier)
                if item_type:
                    new_hierarchy.append(item_type)

                for child in node:
                    extract_breadcrumbs_rec(
                        child,
                        search_depth + 1,
                        breadcrumbs_accum,
                        markup_hier_accum,
                        separators_accum,
                        list_tag_occured=list_tag_occured or is_list_tag,
                        curr_markup_hier=new_hierarchy,
                    )

        # Text following the element may hold a separator and/or a
        # link-less breadcrumb item.
        if node.tail is not None:
            left_sep, parsed_name, right_sep = _parse_breadcrumb_name(node.tail)
            if left_sep and separators_accum and not separators_accum[-1]:
                separators_accum[-1] = left_sep
            if parsed_name:
                breadcrumbs_accum.append(Breadcrumb(name=parsed_name))
                markup_hier_accum.append(curr_markup_hier)
                separators_accum.append(right_sep)

    node = input_to_element(node)

    breadcrumbs: List[Breadcrumb] = []
    markup_hier: List[List[str]] = []
    # Separator text found after each item (None when there is none).
    separators: List[Optional[str]] = []
    extract_breadcrumbs_rec(
        node,
        0,
        breadcrumbs,
        markup_hier,
        separators,
        list_tag_occured=False,
        curr_markup_hier=[],
    )
    # Internal invariant: the three accumulators are always appended together.
    assert len(breadcrumbs) == len(markup_hier) == len(separators)

    return _postprocess_breadcrumbs(breadcrumbs, markup_hier, separators)


def _parse_breadcrumb_name(
    name: Optional[str],
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Break a raw name into ``(left separator, clean name, right separator)``.

    Any element of the result is ``None`` when the corresponding part is
    absent. A text consisting of a separator only is reported as a left
    separator; an empty or missing input yields three ``None`` values.
    """
    if not name:
        return None, None, None

    text = name.strip()
    # The whole text is a separator.
    if SEP_REG.match(text):
        return text, None, None

    # Peel off a separator (plus the whitespace after it) from the front.
    leading = LSTRIP_SEP_REG.match(text)
    if leading:
        left_sep = leading.group().strip()
        remainder = text[leading.end() :]
    else:
        left_sep = None
        remainder = text

    # What is left after the leading separator is itself just a separator.
    if SEP_REG.match(remainder):
        return left_sep, None, remainder.strip()

    # Peel off a separator (plus the whitespace before it) from the back.
    trailing = RSTRIP_SEP_REG.search(remainder)
    if trailing:
        right_sep = trailing.group().strip()
        clean_name = remainder[: trailing.start()]
    else:
        right_sep = None
        clean_name = remainder

    return left_sep, clean_name or None, right_sep


def _postprocess_breadcrumbs(breadcrumbs, markup_hier, separators):
    """
    Clean up the collected breadcrumb items and return them as a tuple
    (``None`` when there are no items at all):
    * A lone item that has a name but no link is split on separators into
    several name-only items.
    * When any item carries semantic markup, the markup decides which items
    are kept.
    * Otherwise the position of separators decides (items that have the main
    separator next to them are kept).
    In the last two cases a duplicated "go back" item at the beginning or the
    end is dropped afterwards.
    """
    if not breadcrumbs:
        return None

    if len(breadcrumbs) == 1:
        only_item = breadcrumbs[0]
        if only_item.name and not only_item.url:
            pieces = [piece.strip() for piece in SPLIT_REG.split(only_item.name)]
            return tuple(Breadcrumb(name=piece) for piece in pieces if piece)

    if any(map(len, markup_hier)):
        selected = _postprocess_using_markup(breadcrumbs, markup_hier)
    else:
        selected = _postprocess_using_separators(breadcrumbs, separators)

    return tuple(_remove_duplicated_first_and_last_items(selected))


def _postprocess_using_markup(breadcrumbs, markup_hier):
    breadcrumb_indices_with_markup = [
        idx for idx, h in enumerate(markup_hier) if len(h) > 0
    ]
    first_with_markup = min(breadcrumb_indices_with_markup, default=-1)
    last_with_markup = max(breadcrumb_indices_with_markup, default=-1)

    # often the items without markup at the beginning and the end are
    # respectively home and product items
    indices_to_leave = {first_with_markup - 1, last_with_markup + 1}

    return [
        b
        for idx, (b, h) in enumerate(zip(breadcrumbs, markup_hier))
        if idx in indices_to_leave or len(h) > 0
    ]


def _postprocess_using_separators(breadcrumbs, separators):
    def prev_sep(idx):
        return separators[idx - 1] if 0 <= idx - 1 < len(separators) else None

    most_common_seps = Counter(filter(None, separators)).most_common()
    main_sep = most_common_seps[0][0] if most_common_seps else None

    if not main_sep:
        return breadcrumbs

    return [
        b
        for idx, (b, sep) in enumerate(zip(breadcrumbs, separators))
        if sep == main_sep or (prev_sep(idx) == main_sep)
    ]


def _extract_markup_type(node):
    def check_schema(name):
        for schema_attr in {"itemtype", "typeof"}:
            if name in node.get(schema_attr, "").lower():
                return True
        return False

    if check_schema("data-vocabulary.org/breadcrumb"):
        return "data-vocabulary"
    if check_schema("listitem"):
        return "schema"


def _remove_duplicated_first_and_last_items(breadcrumbs):
    """
    Remove "go back" urls from the beginning or the end of breadcrumb
    element.
    There is an assumption that there can be only one such url.
    First it tries to remove url at the beginning by checking if there
    is any other the same url in further breadcrumb items. If not, it
    checks the last url by comparing it with remaining urls.
    """
    first_url = breadcrumbs[0].url
    if first_url is not None and first_url in (b.url for b in breadcrumbs[1:] if b.url):
        return breadcrumbs[1:]
    last_url = breadcrumbs[-1].url
    if last_url is not None and last_url in (b.url for b in breadcrumbs[1:-1] if b.url):
        return breadcrumbs[:-1]
    return breadcrumbs


def _has_special_class(class_attr: Optional[str]) -> bool:
    """
    Check if a given value of class attribute has a class that relates to
    drop down like "dropdown", "drop-down", "DropDown", etc., or that
    contains "actions".

    Each class name is compared after removing ASCII punctuation and
    lower-casing it. A missing (``None``) or empty attribute gives ``False``.
    """
    if not class_attr:
        return False

    # Normalize every class name once instead of once per marker.
    normalized = [
        c.translate(_PUNCTUATION_TRANS).lower().strip() for c in class_attr.split()
    ]
    return any(
        marker in cls_name
        for cls_name in normalized
        for marker in ("dropdown", "actions")
    )