Source code for zyte_parsers.gtin

import re
from contextlib import suppress
from typing import Optional, Union

import attr
from gtin.validator import is_valid_GTIN
from stdnum import isbn, ismn, issn

from . import SelectorOrElement
from .utils import extract_text


[docs]@attr.s(frozen=True, auto_attribs=True) class Gtin: type: str value: str
GTIN_MATCH_SPECIAL_CHARACTER_REGEX = re.compile(r"[^0-9a-zA-Z]") GTIN_MATCH_NON_NUMERIC_REGEX = re.compile(r"[^0-9]") # Consider only those prefix which have some numeric values as these # values interfere with the gtin id extraction GTIN_PREFIX = [ "isbn13", "isbn10", "ean13", "gtin8", "gtin12", "gtin13", "gtin14", ] GTIN_PREFIX_REGEX = re.compile("|".join(GTIN_PREFIX), re.IGNORECASE) GTIN_CENTER_REGEX = re.compile(r"^\D*|\D*$")
[docs]def extract_gtin(node: Union[SelectorOrElement, str]) -> Optional[Gtin]: """Extract a GTIN (Global Trade Item Number) from a node or a string that contains its text. It detects the GTIN type and returns it together with the cleaned GTIN value. The following types are supported: `isbn10`, `isbn13`, `issn`, `ismn`, `upc`, `gtin8`, `gtin13`, `gtin14`. :param node: A node or a string that includes the GTIN text. :return: A GTIN item. """ gtin = node if isinstance(node, str) else extract_text(node) gtin_id = extract_gtin_id(gtin) gtin_class = gtin_classification(gtin_id) if gtin_class: assert isinstance(gtin_id, str) return Gtin(gtin_class, gtin_id) return None
def _remove_gtin_numeric_prefix(gtin_code: str) -> str: """ The function removes the gtin specific numeric prefix from the gtin text if length after prefix removal is the expected length for that prefix. E.g. ean13 is a prefix with a numeric value 13 in it. Text ean13 is removed only if after removal the length of numeric code is 13. It is done in order to avoid cases where we by mistake might remove the digits 13 from the actual gtin code. """ prefix_match = GTIN_PREFIX_REGEX.search(gtin_code) if prefix_match: prefix = prefix_match.group() s, e = prefix_match.span() gtin_without_prefix = gtin_code[:s] + gtin_code[e:] gtin_expected_len = int(GTIN_MATCH_NON_NUMERIC_REGEX.sub("", prefix)) numeric_code_without_prefix = GTIN_MATCH_NON_NUMERIC_REGEX.sub( "", gtin_without_prefix ) if len(numeric_code_without_prefix) == gtin_expected_len: return gtin_without_prefix return gtin_code def extract_gtin_id(gtin_code: Optional[str]) -> Optional[str]: """ The function extracts the gtin_id from the text. For text like 'EAN13: 7350053850019', first 'EAN13' is extracted then the gtin_id is extracted. For some tricky sku values like 'TSF8UP-R407-26A44' if we only remove non-numeric values we end up with number like '84072644' which is classified as gtin (issn), therefore, we also check that the gtin_id does not contain any letter between the numeric values. """ if gtin_code: gtin_id_alphanum = GTIN_MATCH_SPECIAL_CHARACTER_REGEX.sub("", gtin_code) gtin_id_suffix = _remove_gtin_numeric_prefix(gtin_id_alphanum) gtin_center = GTIN_CENTER_REGEX.sub("", gtin_id_suffix) gtin_id = GTIN_MATCH_NON_NUMERIC_REGEX.sub("", gtin_center) if gtin_id == gtin_center: return gtin_id return None def gtin_classification(gtin: Optional[str]) -> Optional[str]: """ The function performs gtin classification for the gtin code. The gtin classification is performed based on a number of rules associated with the different gtin categories. The categories considered here for gtin classification are : -isbn10 -isbn13 -issn -ismn -upc -ean13 Return: gtin_class(str) if a class is found else None """ gtin = extract_gtin_id(gtin) if not gtin: return None with suppress(Exception): if ismn.validate(gtin): return "ismn" with suppress(Exception): if isbn.validate(gtin): if len(gtin) == 10: return "isbn10" if len(gtin) == 13: return "isbn13" with suppress(Exception): if issn.validate(gtin): return "issn" if is_valid_GTIN(gtin): if len(gtin) == 8: return "gtin8" if len(gtin) == 14: return "gtin14" if len(gtin) == 13: return "gtin13" if len(gtin) == 12: return "upc" return None