replace Language functions with corresponding orth functions

implement punctuation functions in orth
This commit is contained in:
Wolfgang Seeker 2016-03-09 18:07:37 +01:00
parent d9312bc9ea
commit bc9c62e279
2 changed files with 26 additions and 80 deletions

View File

@ -34,10 +34,6 @@ class Language(object):
def norm(string): def norm(string):
return string return string
@staticmethod
def shape(string):
return orth.word_shape(string)
@staticmethod @staticmethod
def prefix(string): def prefix(string):
return string[0] return string[0]
@ -50,66 +46,14 @@ class Language(object):
def cluster(string): def cluster(string):
return 0 return 0
@staticmethod
def is_alpha(string):
return orth.is_alpha(string)
@staticmethod
def is_ascii(string):
return orth.is_ascii(string)
@staticmethod @staticmethod
def is_digit(string): def is_digit(string):
return string.isdigit() return string.isdigit()
@staticmethod
def is_lower(string):
return orth.is_lower(string)
@staticmethod
def is_punct(string):
return orth.is_punct(string)
@staticmethod @staticmethod
def is_space(string): def is_space(string):
return string.isspace() return string.isspace()
@staticmethod
def is_title(string):
return orth.is_title(string)
@staticmethod
def is_bracket(string):
return orth.is_bracket(string)
@staticmethod
def is_quote(string):
return orth.is_quote(string)
@staticmethod
def is_left_punct(string):
return orth.is_left_punct(string)
@staticmethod
def is_right_punct(string):
return orth.is_right_punct(string)
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@staticmethod
def like_url(string):
return orth.like_url(string)
@staticmethod
def like_num(string):
return orth.like_number(string)
@staticmethod
def like_email(string):
return orth.like_email(string)
@staticmethod @staticmethod
def is_stop(string): def is_stop(string):
return 0 return 0
@ -120,26 +64,26 @@ class Language(object):
return { return {
attrs.LOWER: cls.lower, attrs.LOWER: cls.lower,
attrs.NORM: cls.norm, attrs.NORM: cls.norm,
attrs.SHAPE: cls.shape, attrs.SHAPE: orth.word_shape,
attrs.PREFIX: cls.prefix, attrs.PREFIX: cls.prefix,
attrs.SUFFIX: cls.suffix, attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster, attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: oov_prob, attrs.PROB: lambda string: oov_prob,
attrs.IS_ALPHA: cls.is_alpha, attrs.IS_ALPHA: orth.is_alpha,
attrs.IS_ASCII: cls.is_ascii, attrs.IS_ASCII: orth.is_ascii,
attrs.IS_DIGIT: cls.is_digit, attrs.IS_DIGIT: cls.is_digit,
attrs.IS_LOWER: cls.is_lower, attrs.IS_LOWER: orth.is_lower,
attrs.IS_PUNCT: cls.is_punct, attrs.IS_PUNCT: orth.is_punct,
attrs.IS_SPACE: cls.is_space, attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title, attrs.IS_TITLE: orth.is_title,
attrs.IS_UPPER: cls.is_upper, attrs.IS_UPPER: orth.is_upper,
attrs.FLAG14: cls.is_bracket, attrs.FLAG14: orth.is_bracket,
attrs.FLAG15: cls.is_quote, attrs.FLAG15: orth.is_quote,
attrs.FLAG16: cls.is_left_punct, attrs.FLAG16: orth.is_left_punct,
attrs.FLAG17: cls.is_right_punct, attrs.FLAG17: orth.is_right_punct,
attrs.LIKE_URL: cls.like_url, attrs.LIKE_URL: orth.like_url,
attrs.LIKE_NUM: cls.like_num, attrs.LIKE_NUM: orth.like_number,
attrs.LIKE_EMAIL: cls.like_email, attrs.LIKE_EMAIL: orth.like_email,
attrs.IS_STOP: cls.is_stop, attrs.IS_STOP: cls.is_stop,
attrs.IS_OOV: lambda string: True attrs.IS_OOV: lambda string: True
} }

View File

@ -5,9 +5,6 @@ import unicodedata
import re import re
TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
# Binary string features # Binary string features
cpdef bint is_alpha(unicode string): cpdef bint is_alpha(unicode string):
return string.isalpha() return string.isalpha()
@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
else: else:
return True return True
cpdef bint is_bracket(unicode string): cpdef bint is_bracket(unicode string):
return False brackets = ('(',')','[',']','{','}','<','>')
return string in brackets
cpdef bint is_quote(unicode string): cpdef bint is_quote(unicode string):
if string in ('"', "'"): quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
return True return string in quotes
else:
return False
cpdef bint is_left_punct(unicode string): cpdef bint is_left_punct(unicode string):
return False left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
return string in left_punct
cpdef bint is_right_punct(unicode string): cpdef bint is_right_punct(unicode string):
return False right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
return string in right_punct
cpdef bint is_title(unicode string): cpdef bint is_title(unicode string):