Source code for zyte_parsers.brand

import itertools
from typing import Iterable, Optional

from lxml.html import HtmlElement

from . import SelectorOrElement
from .api import input_to_element
from .utils import extract_text, iterwalk_limited, take


[docs]def extract_brand_name(node: SelectorOrElement, search_depth: int = 0) -> Optional[str]: """Extract a brand name from a node that contains it. It tries element text and image alt and title attributes. :param node: Node including the brand name. :param search_depth: Max depth for searching images. :return: The brand name or None. """ _BRAND_LENGHT_LIMIT = 50 node = input_to_element(node) extracted = _extract_brand(node, search_depth) short = (b for b in extracted if b and len(b) < _BRAND_LENGHT_LIMIT) results = take(short, 1) return results[0] if results else None
def _extract_brand(node: HtmlElement, search_depth: int = 0) -> Iterable[Optional[str]]: if node.tag == "img": return extract_image_text(node, 0) value = extract_text(node) if value: return [value] return extract_image_text(node, search_depth) def extract_image_text(node: HtmlElement, search_depth: int = 0) -> Iterable[str]: def extract_text_from_image(node: HtmlElement) -> Iterable[Optional[str]]: for attrib in ["alt", "title"]: yield (node.attrib.get(attrib) or "").strip() nodes = iterwalk_limited(node, search_depth) images = filter(lambda n: n.tag == "img", nodes) attribs = map(extract_text_from_image, images) flat_attribs = itertools.chain.from_iterable(attribs) valid_attribs = (a for a in flat_attribs if a) return valid_attribs