Locally strip outgoing message text respecting entities

This commit is contained in:
Lonami Exo 2018-11-19 10:15:56 +01:00
parent d854babf22
commit aaee092a46
3 changed files with 42 additions and 4 deletions

View File

@ -6,6 +6,7 @@ from collections import deque
from html import escape, unescape
from html.parser import HTMLParser
from .. import helpers
from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
@ -126,7 +127,8 @@ def parse(html):
parser = HTMLToTelegramParser()
parser.feed(_add_surrogate(html))
return _del_surrogate(parser.text), parser.entities
text = helpers.strip_text(parser.text, parser.entities)
return _del_surrogate(text), parser.entities
def unparse(text, entities):

View File

@ -5,7 +5,7 @@ since they seem to count as two characters and it's a bit strange.
"""
import re
from ..helpers import add_surrogate, del_surrogate
from ..helpers import add_surrogate, del_surrogate, strip_text
from ..tl import TLObject
from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
@ -125,6 +125,7 @@ def parse(message, delimiters=None, url_re=None):
+ message[current.offset:]
)
message = strip_text(message, result)
return del_surrogate(message), result

View File

@ -33,6 +33,41 @@ def del_surrogate(text):
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
def strip_text(text, entities):
    """
    Strips whitespace from both ends of the given text, modifying the
    provided entities in place so they keep covering the same characters.

    Each entity must expose mutable integer ``offset`` and ``length``
    attributes expressed in the same units as indices into ``text``.

    Every entity is clipped to the part of the text that is kept and is
    shifted left by the amount of leading whitespace removed. Entities
    that end up covering nothing (they only spanned stripped whitespace)
    are removed from ``entities``.

    This assumes that entity lengths are greater or equal to one. If the
    text has no surrounding whitespace, the entities are left untouched.

    :param text: the text to strip.
    :param entities: mutable list of entities to adjust; may be empty
                     or ``None``.
    :return: the stripped text.
    """
    if not entities:
        return text.strip()

    stripped = text.strip()
    if len(stripped) == len(text):
        # Nothing was removed, so no entity needs to be adjusted.
        return text

    # Bounds, in the original text, of the characters that are kept.
    # For whitespace-only text both bounds equal len(text) (empty range).
    start = len(text) - len(text.lstrip())
    end = start + len(stripped)

    kept = []
    for entity in entities:
        first = max(entity.offset, start)
        last = min(entity.offset + entity.length, end)
        if first < last:
            # Re-base the offset onto the stripped text; without this any
            # entity after leading whitespace would point at wrong chars.
            entity.offset = first - start
            entity.length = last - first
            kept.append(entity)

    # Slice-assign so the caller's list object itself is updated.
    entities[:] = kept
    return stripped
# endregion
# region Cryptographic related utils