Remove text stripping

This commit is contained in:
Lonami Exo 2024-03-16 19:32:27 +01:00
parent 033b56f1d3
commit 0e48a01ef4
2 changed files with 3 additions and 54 deletions

View File

@ -30,7 +30,7 @@ from ...tl.types import (
MessageEntityUnderline, MessageEntityUnderline,
MessageEntityUrl, MessageEntityUrl,
) )
from .strings import add_surrogate, del_surrogate, strip_text, within_surrogate from .strings import add_surrogate, del_surrogate, within_surrogate
class HTMLToTelegramParser(HTMLParser): class HTMLToTelegramParser(HTMLParser):
@ -141,8 +141,7 @@ def parse(html: str) -> Tuple[str, List[MessageEntity]]:
parser = HTMLToTelegramParser() parser = HTMLToTelegramParser()
parser.feed(add_surrogate(html)) parser.feed(add_surrogate(html))
text = strip_text(parser.text, parser.entities) return del_surrogate(parser.text), parser.entities
return del_surrogate(text), parser.entities
ENTITY_TO_FORMATTER: Dict[ ENTITY_TO_FORMATTER: Dict[

View File

@ -1,7 +1,5 @@
import struct import struct
from typing import List, Optional from typing import Optional
from ...tl.abcs import MessageEntity
def add_surrogate(text: str) -> str: def add_surrogate(text: str) -> str:
@ -33,51 +31,3 @@ def within_surrogate(text: str, index: int, *, length: Optional[int] = None) ->
and "\ud800" <= text[index - 1] <= "\udfff" # previous is and "\ud800" <= text[index - 1] <= "\udfff" # previous is
and "\ud800" <= text[index] <= "\udfff" # current is and "\ud800" <= text[index] <= "\udfff" # current is
) )
def strip_text(text: str, entities: List[MessageEntity]) -> str:
    """
    Strip leading and trailing whitespace from the given text, modifying
    the provided entities so they keep covering the same characters.

    Both the ``entities`` list and the entity objects inside it are updated
    in place: offsets are shifted left by the amount of leading whitespace
    removed, lengths are shortened where an entity reached into removed
    whitespace, and entities left without any characters are dropped from
    the list. Nested or overlapping entities are each adjusted on their own.

    Offsets and lengths are taken to be indices into ``text`` exactly as
    given (no surrogate conversion happens here).

    :param text: The text to strip.
    :param entities: Entities exposing integer ``offset`` and ``length``
        attributes. Modified in place.
    :return: The stripped text.
    """
    if not entities:
        return text.strip()

    stripped = text.strip()
    if len(stripped) == len(text):
        # Nothing was removed, so every entity is already correct as-is.
        return text

    # Bounds, expressed in the original text, of the part that survives.
    keep_end = len(text.rstrip())
    keep_start = keep_end - len(stripped)

    kept: List[MessageEntity] = []
    for e in entities:
        offset, length = getattr(e, "offset", None), getattr(e, "length", None)
        # Narrowing for the type checker; entities are expected to have both.
        assert isinstance(offset, int) and isinstance(length, int)

        # Clip the entity to the surviving window.
        start = max(offset, keep_start)
        end = min(offset + length, keep_end)
        if end <= start:
            # The entity only covered stripped whitespace.
            continue

        setattr(e, "offset", start - keep_start)
        setattr(e, "length", end - start)
        kept.append(e)

    # Slice-assign so callers holding a reference to the list see the result.
    entities[:] = kept
    return stripped