Source code for scistag.emojistag.emoji_db

"""
Defines the class :class:`EmojiDb` which is the heart of this package.

It loads and provides information about all known Emojis and their details
such as the unicode encoding or their category.
"""

from __future__ import annotations
import io
import json
from fnmatch import fnmatch

import scistag.addons
from scistag.common.mt.stag_lock import StagLock
from scistag.common.essential_data import get_edp
from scistag.filestag.file_stag import FileStag

from .emoji_definitions import (EmojiIdentifierTypes, EMOJI_SVG_ADDON,
                                EMOJI_NAMES, EMOJI_DB_NAME,
                                EMOJI_NAMES_MARKDOWN)
from .emoji_info import EmojiInfo


class EmojiDb:
    """
    The Emoji DB provides Emoji and country flag graphics.
    By default it uses the Noto Emoji dataset embedded into the SciStag module.

    All lookup tables are loaded lazily on first use and cached in class
    attributes; loading happens while holding ``_access_lock``.
    """

    _access_lock = StagLock()
    "Shared access lock"
    _initialized = False
    "Defines if the emoji db was initialized"
    _extensions = {}
    "List of known emoji addon packages"
    _svg_emojis = False
    "Defines if SVG emojis are available"
    _markdown_names = {}
    "Markdown name conversion dictionary"
    _unicode_names = {}
    """
    The dictionary contains all official names of the emojis and their
    corresponding unicode sequence
    """
    _valid_sequences = set()
    "Set of valid unicode sequences"
    _main_dict = {}
    """
    Contains all details about every single known emoji such as name,
    category, subcategory and of course unicode sequence
    """

    @classmethod
    def _load_json(cls, name: str) -> dict:
        """
        Loads and parses a JSON file via FileStag.

        :param name: The file's name, appended to the path prefix returned
            by ``get_edp()``.
        :return: The parsed JSON content
        """
        file_data = FileStag.load(get_edp() + name)
        return json.load(io.BytesIO(file_data))

    @classmethod
    def _get_markdown_dict(cls) -> dict:
        """
        Returns the markdown name dictionary. Contains all common markdown
        emoji names as key and their corresponding unique sequence as value

        :return: The dictionary
        """
        with cls._access_lock:
            if not cls._markdown_names:  # not loaded yet
                cls._markdown_names = cls._load_json(EMOJI_NAMES_MARKDOWN)
            return cls._markdown_names

    @classmethod
    def _get_unicode_dict(cls) -> dict:
        """
        Returns the unicode name dictionary. Contains all common emoji names as
        key and their corresponding unique sequence as value for more than
        3600 emojis. See unicode.org for more details.

        :return: The dictionary
        """
        with cls._access_lock:
            if not cls._unicode_names:  # not loaded yet
                cls._unicode_names = cls._load_json(EMOJI_NAMES)
            return cls._unicode_names

    @classmethod
    def get_all_valid_sequences(cls) -> set:
        """
        Returns a set of all (known) valid emoji sequences

        :return: A set of valid emoji sequences (all uppercased and with
            an underscore separating the single elements)
        """
        with cls._access_lock:
            if not cls._valid_sequences:
                # NOTE(review): _get_unicode_dict acquires _access_lock again
                # while it is already held here - this assumes StagLock is
                # reentrant; confirm.
                cls._valid_sequences = set(cls._get_unicode_dict().values())
            return cls._valid_sequences

    @classmethod
    def _get_main_dict(cls) -> dict:
        """
        Returns the main dictionary containing all details about an emoji.

        :return: The dictionary, using the emoji's sequence (elements joined
            by underscores) as key
        """
        with cls._access_lock:
            if cls._main_dict:
                return cls._main_dict
            cls._main_dict = cls._load_json(EMOJI_DB_NAME)
            # hotfix for property renaming, will be fixed with next data
            # update
            for key, value in cls._main_dict.items():
                value["sequence"] = key.split("_")
                value["category"] = value["group"]
                value["subcategory"] = value["subgroup"]
            return cls._main_dict

    @classmethod
    def get_sequence_for_name(cls, identifier: str) -> list:
        """
        Returns the unicode sequence for given unicode identifier

        :param identifier: Either the full qualified identifier as defined by
            unicode.org supporting all >3600 emojis as defined by unicode.org,
            the markdown shortcode enclosed by two colons such as ":deer:"
            as defined on GitHub, or the emoji character(s) themselves.
        :return: The unicode sequence if the emoji could be found,
            otherwise an empty list
        """
        if identifier.startswith(":") and identifier.endswith(":"):
            md_sequence = cls._get_markdown_dict().get(identifier[1:-1])
            # An unknown shortcode has to yield [] - splitting an empty
            # string would yield [""] which is not an empty list.
            return md_sequence.split("_") if md_sequence else []
        unicode_dict = cls._get_unicode_dict()
        if identifier in unicode_dict:
            return unicode_dict[identifier].split("_")
        # Treat the identifier as the emoji itself: one lower case hex code
        # point without leading zeros per character. Converting via ord()
        # handles every code point uniformly, including those below U+0100
        # and sequences which begin with an ASCII character.
        sequence = [f"{ord(character):x}" for character in identifier]
        if cls.validate_sequence(sequence):
            return sequence
        return []

    @classmethod
    def get_character_sequence(cls, identifier: EmojiIdentifierTypes) -> \
            list[str]:
        """
        Converts an emoji identifier to a unicode sequence (a list of hex
        code points) as used by the other methods of this class.

        Does not alter the value if a unicode sequence was passed already.

        :param identifier: The emoji identifier, either its unicode name,
            markdown name surrounded by colons or a single emoji character.
        :return: The unicode sequence, an empty list if a string identifier
            could not be resolved
        """
        if isinstance(identifier, str):
            identifier = cls.get_sequence_for_name(identifier)
        return identifier

    @classmethod
    def get_character(cls, identifier: EmojiIdentifierTypes) -> str:
        """
        Returns the printable emoji character(s) for an identifier

        :param identifier: The emoji's identifier

        :return: The emoji character (if valid), otherwise an empty string
        """
        sequence = cls.get_character_sequence(identifier)
        if len(sequence) == 0:
            return ""
        # build "\UXXXXXXXX" escapes (8 hex digits each) and decode them
        encoding = "".join(["\\U" + element.zfill(8) for element in sequence])
        ascii_encoding = encoding.encode("ASCII")
        return ascii_encoding.decode("unicode-escape")

    @classmethod
    def validate_sequence(cls, sequence: list[str]) -> bool:
        """
        Returns if given sequence is known (in the current version) of our
        Emoji database.

        :param sequence: The sequence as unicode characters without leading
            zeros. The comparison is case-insensitive.
        :return: True if the sequence is known
        """
        return "_".join(sequence).upper() in cls.get_all_valid_sequences()

    @classmethod
    def get_extensions(cls) -> dict:
        """
        Returns all available emoji extensions

        :return: Dictionary of extensions and their corresponding FileStag path
            to access their data
        """
        # imported locally as in the original code - presumably to avoid a
        # circular or costly import at module load time
        from scistag.imagestag import svg
        with cls._access_lock:
            if not cls._initialized:
                cls._extensions = \
                    scistag.addons.AddonManager.get_addons_paths("emojis.*")
                cls._initialized = True
                # SVG emojis require the SVG addon and a working renderer
                cls._svg_emojis = \
                    EMOJI_SVG_ADDON in cls._extensions and \
                    svg.SvgRenderer.available()
        return cls._extensions

    @classmethod
    def get_svg_support(cls) -> bool:
        """
        Returns if SVG rendering is supported and the SVG repo installed

        :return: True if high quality rendering is possible
        """
        cls.get_extensions()  # ensures _svg_emojis was determined
        return cls._svg_emojis

    @classmethod
    def get_svg(cls, sequence: list[str]) -> bytes | None:
        """
        Tries to read the SVG of an emoji from the database

        :param sequence: The unicode sequence, e.g. ["1f98c"] for a stag.
            The elements are lower cased and joined by underscores to build
            the file name.
        :return: The result of ``FileStag.load`` for the SVG file, None if
            the SVG addon is not installed
        """
        extensions = cls.get_extensions()
        if EMOJI_SVG_ADDON not in extensions:
            return None
        combined = "_".join(element.lower() for element in sequence)
        emoji_path = (extensions[EMOJI_SVG_ADDON] +
                      f"images/noto/emojis/svg/emoji_u{combined}.svg")
        return FileStag.load(emoji_path)

    @classmethod
    def get_png(cls,
                sequence: list[str]) -> bytes | None:
        """
        Tries to read the PNG of an emoji from the database

        :param sequence: The unicode sequence, e.g. ["1f98c"] for a stag.
            The elements are lower cased and joined by underscores to build
            the file name.
        :return: The result of ``FileStag.load`` for the PNG file
        """
        combined = "_".join(element.lower() for element in sequence)
        emoji_path = get_edp() + f"images/noto/cpngs/emoji_u{combined}.png"
        return FileStag.load(emoji_path)

    @classmethod
    def get_details(cls, sequence: list[str]) -> EmojiInfo | None:
        """
        Returns details about an emoji

        :param sequence: The unicode sequence without leading zeros.
        :return: The EmojiInfo object if available, otherwise None
        """
        main_db = cls._get_main_dict()
        combined = "_".join(element.upper() for element in sequence)
        if combined in main_db:
            return EmojiInfo.parse_obj(main_db[combined])
        return None

    @classmethod
    def get_categories(cls) -> list[str]:
        """
        Returns a list of all emoji main categories

        :return: A sorted list of all known emoji categories
        """
        main_dict = cls._get_main_dict()
        return sorted({element["category"] for element in main_dict.values()})

    @classmethod
    def get_sub_categories(cls, category: str) -> list[str]:
        """
        Returns a list of all emoji sub categories of given category

        :param category: The category's name
        :return: A sorted list of subcategories in this category
        """
        main_dict = cls._get_main_dict()
        return sorted({element['subcategory']
                       for element in main_dict.values()
                       if element['category'] == category})

    @classmethod
    def get_emojis_in_category(cls, category: str, subcategory: str | None) -> \
            list[EmojiInfo]:
        """
        Returns all emojis in the defined category and subcategory

        :param category: The main category's name as obtained by
            :meth:`get_categories`.
        :param subcategory: The name of the subcategory. If no subcategory is
            provided all emojis in the category will be returned.
        :return: A list of all emojis in given category and subcategory,
            sorted by name
        """
        main_dict = cls._get_main_dict()
        filtered_list = [
            EmojiInfo.parse_obj(element) for element in main_dict.values()
            if element['category'] == category and
            (subcategory is None or element['subcategory'] == subcategory)]
        return sorted(filtered_list, key=lambda element: element.name)

    @classmethod
    def find_emojis_by_name(cls, name_mask: str,
                            md: bool = False) -> list[EmojiInfo]:
        """
        Returns all emojis which match the defined search pattern

        :param name_mask: The name mask to search for, e.g *sun*. Matching
            is done using :func:`fnmatch.fnmatch`.
        :param md: Defines if the GitHub markdown db name shall be used instead
            of the full unicode name list.
        :return: A list of all matching Emojis
        """
        main_dict = cls._get_main_dict()
        if md:
            # not every emoji has a markdown name, these never match
            matches = (element for element in main_dict.values()
                       if fnmatch(element.get('markdownName', ""), name_mask))
        else:
            matches = (element for element in main_dict.values()
                       if fnmatch(element['name'], name_mask))
        return [EmojiInfo.parse_obj(element) for element in matches]

    @classmethod
    def __getitem__(cls, key: str) -> EmojiInfo | None:
        """
        Returns the emoji details for a specific Emoji by its name.

        As this is a class method it is found when subscripting an instance
        (``EmojiDb()["deer"]``).

        :param key: Either the full qualified name as defined in the unicode
            database or its GitHub markdown name surrounded by colons, eg
            "deer" or ":deer:"
        :return: The Emoji description
        :raises KeyError: If no sequence could be found for the key
        """
        sequence = cls.get_sequence_for_name(key)
        if len(sequence) > 0:
            return cls.get_details(sequence)
        raise KeyError("Emoji not found")


def get_emoji_sequence(identifier: EmojiIdentifierTypes) -> list[str]:
    """
    Resolves an emoji identifier to its unicode sequence.

    Accepted are an emoji name such as ``"deer"``, a markdown emoji
    identifier such as ``":deer:"`` (see GitHub) or a single unicode
    character. The work is delegated to
    :meth:`EmojiDb.get_character_sequence`.

    A value which is not a string, e.g. a unicode sequence, is returned
    unmodified.

    :param identifier: The emoji identifier, either its name such as "deer",
        its markdown name such as ":deer:", a single unicode character
        like 🦌 or a unicode sequence as list of strings.
    :return: The unicode sequence, e.g. ["1f98c"] for a deer
    """
    sequence = EmojiDb.get_character_sequence(identifier=identifier)
    return sequence


def get_emoji_sequence_valid(sequence: list[str]) -> bool:
    """
    Tells if the given emoji sequence is known to our current database.

    Delegates to :meth:`EmojiDb.validate_sequence`.

    :param sequence: The sequence, a list of unicode sequence components
        without leading zeros as returned by :meth:`get_emoji_sequence`.
    :return: True if the sequence is known
    """
    is_known = EmojiDb.validate_sequence(sequence)
    return is_known


def get_emoji_character(identifier: EmojiIdentifierTypes) -> str:
    """
    Looks up the emoji unicode character for an emoji identifier.

    Delegates to :meth:`EmojiDb.get_character`.

    :param identifier: The emoji identifier, either its name such as "deer",
        its markdown name such as ":deer:", a single unicode character
        like 🦌 or a unicode sequence as list of strings.
    :return: If the Emoji could be found a single unicode emoji character
        such as 🦌, otherwise an empty string.
    """
    character = EmojiDb.get_character(identifier)
    return character


def get_emoji_details(identifier: EmojiIdentifierTypes) -> EmojiInfo | None:
    """
    Looks up the details of an emoji.

    The identifier is first converted to a unicode sequence, which is then
    passed to :meth:`EmojiDb.get_details`.

    :param identifier: The emoji identifier, either its name such as "deer",
        its markdown name such as ":deer:", a single unicode character
        like 🦌 or a combined character like 🇩🇪.
    :return: The Emoji info object if available providing all information
        about the emoji stored in our db.
    """
    sequence = EmojiDb.get_character_sequence(identifier)
    return EmojiDb.get_details(sequence)


def find_emojis_by_name(name_mask: str, md: bool = False):
    """
    Searches for all emojis whose name matches the given pattern.

    Module level shortcut for :meth:`EmojiDb.find_emojis_by_name`.

    :param name_mask: The name mask to search for, e.g *sun*
    :param md: Defines if the GitHub markdown db name shall be used instead
        of the full unicode name list.
    :return: A list of all matching Emojis
    """
    matches = EmojiDb.find_emojis_by_name(name_mask=name_mask, md=md)
    return matches