2026-1-6
This commit is contained in:
361
venv/Lib/site-packages/emoji/tokenizer.py
Normal file
361
venv/Lib/site-packages/emoji/tokenizer.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""
|
||||
emoji.tokenizer
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Components for detecting and tokenizing emoji in strings.
|
||||
|
||||
"""
|
||||
from typing import NamedTuple, Dict, Union, Iterator, Any
|
||||
from emoji import unicode_codes
|
||||
|
||||
|
||||
__all__ = [
|
||||
'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
|
||||
'tokenize', 'filter_tokens',
|
||||
]
|
||||
|
||||
_ZWJ = '\u200D'
|
||||
_SEARCH_TREE = None
|
||||
|
||||
|
||||
class EmojiMatch:
|
||||
"""
|
||||
Represents a match of a "recommended for general interchange" (RGI)
|
||||
emoji in a string.
|
||||
"""
|
||||
|
||||
__slots__ = ('emoji', 'start', 'end', 'data')
|
||||
|
||||
def __init__(self, emoji: str, start: int,
|
||||
end: int, data: Union[dict, None]):
|
||||
|
||||
self.emoji = emoji
|
||||
"""The emoji substring"""
|
||||
|
||||
self.start = start
|
||||
"""The start index of the match in the string"""
|
||||
|
||||
self.end = end
|
||||
"""The end index of the match in the string"""
|
||||
|
||||
self.data = data
|
||||
"""The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the emoji is non-RGI"""
|
||||
|
||||
def data_copy(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Returns a copy of the data from :data:`EMOJI_DATA` for this match
|
||||
with the additional keys ``match_start`` and ``match_end``.
|
||||
"""
|
||||
if self.data:
|
||||
emj_data = self.data.copy()
|
||||
emj_data['match_start'] = self.start
|
||||
emj_data['match_end'] = self.end
|
||||
return emj_data
|
||||
else:
|
||||
return {
|
||||
'match_start': self.start,
|
||||
'match_end': self.end
|
||||
}
|
||||
|
||||
def is_zwj(self) -> bool:
|
||||
"""
|
||||
Checks if this is a ZWJ-emoji.
|
||||
|
||||
:returns: True if this is a ZWJ-emoji, False otherwise
|
||||
"""
|
||||
|
||||
return _ZWJ in self.emoji
|
||||
|
||||
def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
|
||||
"""
|
||||
Splits a ZWJ-emoji into its constituents.
|
||||
|
||||
:returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self
|
||||
"""
|
||||
|
||||
if self.is_zwj():
|
||||
return EmojiMatchZWJ(self)
|
||||
else:
|
||||
return self
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'{self.__class__.__name__}({self.emoji}, {self.start}:{self.end})'
|
||||
|
||||
|
||||
class EmojiMatchZWJ(EmojiMatch):
|
||||
"""
|
||||
Represents a match of multiple emoji in a string that were joined by
|
||||
zero-width-joiners (ZWJ/``\\u200D``)."""
|
||||
|
||||
__slots__ = ('emojis', )
|
||||
|
||||
def __init__(self, match: EmojiMatch):
|
||||
super().__init__(match.emoji, match.start, match.end, match.data)
|
||||
|
||||
self.emojis = []
|
||||
"""List of sub emoji as EmojiMatch objects"""
|
||||
|
||||
i = match.start
|
||||
for e in match.emoji.split(_ZWJ):
|
||||
m = EmojiMatch(
|
||||
e, i, i+len(e), unicode_codes.EMOJI_DATA.get(e, None))
|
||||
self.emojis.append(m)
|
||||
i += len(e) + 1
|
||||
|
||||
def join(self) -> str:
|
||||
"""
|
||||
Joins a ZWJ-emoji into a string
|
||||
"""
|
||||
|
||||
return _ZWJ.join(e.emoji for e in self.emojis)
|
||||
|
||||
def is_zwj(self) -> bool:
|
||||
return True
|
||||
|
||||
def split(self) -> 'EmojiMatchZWJ':
|
||||
return self
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'{self.__class__.__name__}({self.join()}, {self.start}:{self.end})'
|
||||
|
||||
|
||||
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
|
||||
"""
|
||||
Represents a match of multiple emoji in a string that were joined by
|
||||
zero-width-joiners (ZWJ/``\\u200D``). This class is only used for emoji
|
||||
that are not "recommended for general interchange" (non-RGI) by Unicode.org.
|
||||
The data property of this class is always None.
|
||||
"""
|
||||
|
||||
def __init__(self, first_emoji_match: EmojiMatch,
|
||||
second_emoji_match: EmojiMatch):
|
||||
|
||||
self.emojis = [first_emoji_match, second_emoji_match]
|
||||
"""List of sub emoji as EmojiMatch objects"""
|
||||
|
||||
self._update()
|
||||
|
||||
def _update(self):
|
||||
self.emoji = _ZWJ.join(e.emoji for e in self.emojis)
|
||||
self.start = self.emojis[0].start
|
||||
self.end = self.emojis[-1].end
|
||||
self.data = None
|
||||
|
||||
def _add(self, next_emoji_match: EmojiMatch):
|
||||
self.emojis.append(next_emoji_match)
|
||||
self._update()
|
||||
|
||||
|
||||
class Token(NamedTuple):
|
||||
"""
|
||||
A named tuple containing the matched string and its :class:`EmojiMatch` object if it is an emoji
|
||||
or a single character that is not a unicode emoji.
|
||||
"""
|
||||
chars: str
|
||||
value: Union[str, EmojiMatch]
|
||||
|
||||
|
||||
def tokenize(string, keep_zwj: bool) -> Iterator[Token]:
|
||||
"""
|
||||
Finds unicode emoji in a string. Yields all normal characters as a named
|
||||
tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.
|
||||
|
||||
:param string: String contains unicode characters. MUST BE UNICODE.
|
||||
:param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
|
||||
skipped or should be yielded as normal characters
|
||||
:return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
|
||||
"""
|
||||
|
||||
tree = get_search_tree()
|
||||
EMOJI_DATA = unicode_codes.EMOJI_DATA
|
||||
# result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
|
||||
result = []
|
||||
i = 0
|
||||
length = len(string)
|
||||
ignore = [] # index of chars in string that are skipped, i.e. the ZWJ-char in non-RGI-ZWJ-sequences
|
||||
while i < length:
|
||||
consumed = False
|
||||
char = string[i]
|
||||
if i in ignore:
|
||||
i += 1
|
||||
if char == _ZWJ and keep_zwj:
|
||||
result.append(Token(char, char))
|
||||
continue
|
||||
|
||||
elif char in tree:
|
||||
j = i + 1
|
||||
sub_tree = tree[char]
|
||||
while j < length and string[j] in sub_tree:
|
||||
if j in ignore:
|
||||
break
|
||||
sub_tree = sub_tree[string[j]]
|
||||
j += 1
|
||||
if 'data' in sub_tree:
|
||||
emj_data = sub_tree['data']
|
||||
code_points = string[i:j]
|
||||
|
||||
# We cannot yield the result here, we need to defer
|
||||
# the call until we are sure that the emoji is finished
|
||||
# i.e. we're not inside an ongoing ZWJ-sequence
|
||||
match_obj = EmojiMatch(code_points, i, j, emj_data)
|
||||
|
||||
i = j - 1
|
||||
consumed = True
|
||||
result.append(Token(code_points, match_obj))
|
||||
|
||||
elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
|
||||
# the current char is ZWJ and the last match was an emoji
|
||||
ignore.append(i)
|
||||
if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
|
||||
# last match was a component, it could be ZWJ+EMOJI+COMPONENT
|
||||
# or ZWJ+COMPONENT
|
||||
i = i - sum(len(t.chars) for t in result[-2:])
|
||||
if string[i] == _ZWJ:
|
||||
# It's ZWJ+COMPONENT, move one back
|
||||
i += 1
|
||||
del result[-1]
|
||||
else:
|
||||
# It's ZWJ+EMOJI+COMPONENT, move two back
|
||||
del result[-2:]
|
||||
else:
|
||||
# last match result[-1] was a normal emoji, move cursor
|
||||
# before the emoji
|
||||
i = i - len(result[-1].chars)
|
||||
del result[-1]
|
||||
continue
|
||||
|
||||
elif result:
|
||||
yield from result
|
||||
result = []
|
||||
|
||||
if not consumed and char != '\uFE0E' and char != '\uFE0F':
|
||||
result.append(Token(char, char))
|
||||
i += 1
|
||||
|
||||
yield from result
|
||||
|
||||
|
||||
def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
|
||||
"""
|
||||
Filters the output of `tokenize()`
|
||||
|
||||
:param matches: An iterable of tuples of the form ``(match_str, result)``
|
||||
where ``result`` is either an EmojiMatch or a string.
|
||||
:param emoji_only: If True, only EmojiMatch are returned in the output.
|
||||
If False all characters are returned
|
||||
:param join_emoji: If True, multiple EmojiMatch are merged into
|
||||
a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.
|
||||
|
||||
:return: An iterable of tuples :class:`Token` ``(char, char)``,
|
||||
:class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
|
||||
"""
|
||||
|
||||
if not join_emoji and not emoji_only:
|
||||
yield from matches
|
||||
return
|
||||
|
||||
if not join_emoji:
|
||||
for token in matches:
|
||||
if token.chars != _ZWJ:
|
||||
yield token
|
||||
return
|
||||
|
||||
# Combine multiple EmojiMatch that are separated by ZWJs into
|
||||
# a single EmojiMatchZWJNonRGI
|
||||
previous_is_emoji = False
|
||||
previous_is_zwj = False
|
||||
pre_previous_is_emoji = False
|
||||
accumulator = []
|
||||
for token in matches:
|
||||
pre_previous_is_emoji = previous_is_emoji
|
||||
if previous_is_emoji and token.value == _ZWJ:
|
||||
previous_is_zwj = True
|
||||
elif isinstance(token.value, EmojiMatch):
|
||||
if pre_previous_is_emoji and previous_is_zwj:
|
||||
if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
|
||||
accumulator[-1].value._add(token.value)
|
||||
accumulator[-1] = Token(accumulator[-1].chars +
|
||||
_ZWJ + token.chars, accumulator[-1].value)
|
||||
else:
|
||||
prev = accumulator.pop()
|
||||
accumulator.append(
|
||||
Token(prev.chars + _ZWJ + token.chars,
|
||||
EmojiMatchZWJNonRGI(
|
||||
prev.value,
|
||||
token.value)))
|
||||
else:
|
||||
accumulator.append(token)
|
||||
previous_is_emoji = True
|
||||
previous_is_zwj = False
|
||||
else:
|
||||
# Other character, not an emoji
|
||||
previous_is_emoji = False
|
||||
previous_is_zwj = False
|
||||
yield from accumulator
|
||||
if not emoji_only:
|
||||
yield token
|
||||
accumulator = []
|
||||
yield from accumulator
|
||||
|
||||
|
||||
def get_search_tree() -> Dict[str, Any]:
|
||||
"""
|
||||
Generate a search tree for demojize().
|
||||
Example of a search tree::
|
||||
|
||||
EMOJI_DATA =
|
||||
{'a': {'en': ':Apple:'},
|
||||
'b': {'en': ':Bus:'},
|
||||
'ba': {'en': ':Bat:'},
|
||||
'band': {'en': ':Beatles:'},
|
||||
'bandit': {'en': ':Outlaw:'},
|
||||
'bank': {'en': ':BankOfEngland:'},
|
||||
'bb': {'en': ':BB-gun:'},
|
||||
'c': {'en': ':Car:'}}
|
||||
|
||||
_SEARCH_TREE =
|
||||
{'a': {'data': {'en': ':Apple:'}},
|
||||
'b': {'a': {'data': {'en': ':Bat:'},
|
||||
'n': {'d': {'data': {'en': ':Beatles:'},
|
||||
'i': {'t': {'data': {'en': ':Outlaw:'}}}},
|
||||
'k': {'data': {'en': ':BankOfEngland:'}}}},
|
||||
'b': {'data': {'en': ':BB-gun:'}},
|
||||
'data': {'en': ':Bus:'}},
|
||||
'c': {'data': {'en': ':Car:'}}}
|
||||
|
||||
_SEARCH_TREE
|
||||
/ | ⧵
|
||||
/ | ⧵
|
||||
a b c
|
||||
| / | ⧵ |
|
||||
| / | ⧵ |
|
||||
:Apple: ba :Bus: bb :Car:
|
||||
/ ⧵ |
|
||||
/ ⧵ |
|
||||
:Bat: ban :BB-gun:
|
||||
/ ⧵
|
||||
/ ⧵
|
||||
band bank
|
||||
/ ⧵ |
|
||||
/ ⧵ |
|
||||
bandi :Beatles: :BankOfEngland:
|
||||
|
|
||||
bandit
|
||||
|
|
||||
:Outlaw:
|
||||
|
||||
|
||||
"""
|
||||
global _SEARCH_TREE
|
||||
if _SEARCH_TREE is None:
|
||||
_SEARCH_TREE = {}
|
||||
for emj in unicode_codes.EMOJI_DATA:
|
||||
sub_tree = _SEARCH_TREE
|
||||
lastidx = len(emj) - 1
|
||||
for i, char in enumerate(emj):
|
||||
if char not in sub_tree:
|
||||
sub_tree[char] = {}
|
||||
sub_tree = sub_tree[char]
|
||||
if i == lastidx:
|
||||
sub_tree['data'] = unicode_codes.EMOJI_DATA[emj]
|
||||
return _SEARCH_TREE
|
||||
Reference in New Issue
Block a user