spaCy/spacy/lang/lex_attrs.py
idoshr b10c7bc56e
Hebrew like num (#5952)
* Update stop_words.py

Hebrew STOP WORDS

* Update stop_words.py

* contributor

* contributor

* add some common domain extentions
support human number 1K/1M....

* support human number 1K/1M....

* hebrew number tokenize
1K/1M implement in EN

* test human tokenize fix

* test

* heb like num
revert human number change

* heb like num
2020-08-24 14:30:05 +02:00

241 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf8
from __future__ import unicode_literals
import unicodedata
import re
from .. import attrs
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
_tlds = set(
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
"name|pro|tel|travel|xyz|icu|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|"
"ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|"
"cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|"
"ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|"
"gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|"
"je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|"
"lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|"
"na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|"
"pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|"
"ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|"
"ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw".split("|")
)
def is_punct(text):
for char in text:
if not unicodedata.category(char).startswith("P"):
return False
return True
def is_ascii(text):
for char in text:
if ord(char) >= 128:
return False
return True
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
# can be overwritten by lang with list of number words
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
return False
def is_bracket(text):
brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
return text in brackets
def is_quote(text):
quotes = (
'"',
"'",
"`",
"«",
"»",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"''",
"``",
)
return text in quotes
def is_left_punct(text):
left_punct = (
"(",
"[",
"{",
"<",
'"',
"'",
"«",
"",
"",
"",
"",
"",
"",
"",
"",
"``",
)
return text in left_punct
def is_right_punct(text):
right_punct = (")", "]", "}", ">", '"', "'", "»", "", "", "", "", "''")
return text in right_punct
def is_currency(text):
# can be overwritten by lang with list of currency words, e.g. dollar, euro
for char in text:
if unicodedata.category(char) != "Sc":
return False
return True
def like_email(text):
return bool(_like_email(text))
def like_url(text):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if text.startswith("http://") or text.startswith("https://"):
return True
elif text.startswith("www.") and len(text) >= 5:
return True
if text[0] == "." or text[-1] == ".":
return False
if "@" in text:
# prevent matches on e-mail addresses check after splitting the text
# to still allow URLs containing an '@' character (see #1715)
return False
for i in range(len(text)):
if text[i] == ".":
break
else:
return False
tld = text.rsplit(".", 1)[1].split(":", 1)[0]
if tld.endswith("/"):
return True
if tld.isalpha() and tld in _tlds:
return True
return False
def word_shape(text):
if len(text) >= 100:
return "LONG"
shape = []
last = ""
shape_char = ""
seq = 0
for char in text:
if char.isalpha():
if char.isupper():
shape_char = "X"
else:
shape_char = "x"
elif char.isdigit():
shape_char = "d"
else:
shape_char = char
if shape_char == last:
seq += 1
else:
seq = 0
last = shape_char
if seq < 4:
shape.append(shape_char)
return "".join(shape)
def lower(string):
return string.lower()
def prefix(string):
return string[0]
def suffix(string):
return string[-3:]
def is_alpha(string):
return string.isalpha()
def is_digit(string):
return string.isdigit()
def is_lower(string):
return string.islower()
def is_space(string):
return string.isspace()
def is_title(string):
return string.istitle()
def is_upper(string):
return string.isupper()
def is_stop(string, stops=set()):
return string.lower() in stops
LEX_ATTRS = {
attrs.LOWER: lower,
attrs.NORM: lower,
attrs.PREFIX: prefix,
attrs.SUFFIX: suffix,
attrs.IS_ALPHA: is_alpha,
attrs.IS_DIGIT: is_digit,
attrs.IS_LOWER: is_lower,
attrs.IS_SPACE: is_space,
attrs.IS_TITLE: is_title,
attrs.IS_UPPER: is_upper,
attrs.IS_STOP: is_stop,
attrs.LIKE_EMAIL: like_email,
attrs.LIKE_NUM: like_num,
attrs.IS_PUNCT: is_punct,
attrs.IS_ASCII: is_ascii,
attrs.SHAPE: word_shape,
attrs.IS_BRACKET: is_bracket,
attrs.IS_QUOTE: is_quote,
attrs.IS_LEFT_PUNCT: is_left_punct,
attrs.IS_RIGHT_PUNCT: is_right_punct,
attrs.IS_CURRENCY: is_currency,
attrs.LIKE_URL: like_url,
}