spaCy/spacy/lang/lex_attrs.py
Sofie 46dfe773e1 Replacing regex library with re to increase tokenization speed (#3218)
* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alterations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive
2019-02-01 18:05:22 +11:00

256 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf8
from __future__ import unicode_literals
import unicodedata
import re
from .. import attrs
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
_tlds = set(
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
"name|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|"
"ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|"
"cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|"
"ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|"
"gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|"
"je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|"
"lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|"
"na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|"
"pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|"
"ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|"
"ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw".split("|")
)
def is_punct(text):
for char in text:
if not unicodedata.category(char).startswith("P"):
return False
return True
def is_ascii(text):
for char in text:
if ord(char) >= 128:
return False
return True
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
# can be overwritten by lang with list of number words
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
return False
def is_bracket(text):
brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
return text in brackets
def is_quote(text):
quotes = (
'"',
"'",
"`",
"«",
"»",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"''",
"``",
)
return text in quotes
def is_left_punct(text):
left_punct = (
"(",
"[",
"{",
"<",
'"',
"'",
"«",
"",
"",
"",
"",
"",
"",
"",
"",
"``",
)
return text in left_punct
def is_right_punct(text):
right_punct = (")", "]", "}", ">", '"', "'", "»", "", "", "", "", "''")
return text in right_punct
def is_currency(text):
# can be overwritten by lang with list of currency words, e.g. dollar, euro
for char in text:
if unicodedata.category(char) != "Sc":
return False
return True
def like_email(text):
return bool(_like_email(text))
def like_url(text):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if text.startswith("http://") or text.startswith("https://"):
return True
elif text.startswith("www.") and len(text) >= 5:
return True
if text[0] == "." or text[-1] == ".":
return False
if "@" in text:
# prevent matches on e-mail addresses check after splitting the text
# to still allow URLs containing an '@' character (see #1715)
return False
for i in range(len(text)):
if text[i] == ".":
break
else:
return False
tld = text.rsplit(".", 1)[1].split(":", 1)[0]
if tld.endswith("/"):
return True
if tld.isalpha() and tld in _tlds:
return True
return False
def word_shape(text):
if len(text) >= 100:
return "LONG"
shape = []
last = ""
shape_char = ""
seq = 0
for char in text:
if char.isalpha():
if char.isupper():
shape_char = "X"
else:
shape_char = "x"
elif char.isdigit():
shape_char = "d"
else:
shape_char = char
if shape_char == last:
seq += 1
else:
seq = 0
last = shape_char
if seq < 4:
shape.append(shape_char)
return "".join(shape)
def lower(string):
return string.lower()
def prefix(string):
return string[0]
def suffix(string):
return string[-3:]
def cluster(string):
return 0
def is_alpha(string):
return string.isalpha()
def is_digit(string):
return string.isdigit()
def is_lower(string):
return string.islower()
def is_space(string):
return string.isspace()
def is_title(string):
return string.istitle()
def is_upper(string):
return string.isupper()
def is_stop(string, stops=set()):
return string.lower() in stops
def is_oov(string):
return True
def get_prob(string):
return -20.0
LEX_ATTRS = {
attrs.LOWER: lower,
attrs.NORM: lower,
attrs.PREFIX: prefix,
attrs.SUFFIX: suffix,
attrs.CLUSTER: cluster,
attrs.IS_ALPHA: is_alpha,
attrs.IS_DIGIT: is_digit,
attrs.IS_LOWER: is_lower,
attrs.IS_SPACE: is_space,
attrs.IS_TITLE: is_title,
attrs.IS_UPPER: is_upper,
attrs.IS_STOP: is_stop,
attrs.IS_OOV: is_oov,
attrs.PROB: get_prob,
attrs.LIKE_EMAIL: like_email,
attrs.LIKE_NUM: like_num,
attrs.IS_PUNCT: is_punct,
attrs.IS_ASCII: is_ascii,
attrs.SHAPE: word_shape,
attrs.IS_BRACKET: is_bracket,
attrs.IS_QUOTE: is_quote,
attrs.IS_LEFT_PUNCT: is_left_punct,
attrs.IS_RIGHT_PUNCT: is_right_punct,
attrs.IS_CURRENCY: is_currency,
attrs.LIKE_URL: like_url,
}