replace Language functions with corresponding orth functions

implement punctuation functions in orth
This commit is contained in:
Wolfgang Seeker 2016-03-09 18:07:37 +01:00
parent d9312bc9ea
commit bc9c62e279
2 changed files with 26 additions and 80 deletions

View File

@ -33,10 +33,6 @@ class Language(object):
@staticmethod
def norm(string):
return string
@staticmethod
def shape(string):
return orth.word_shape(string)
@staticmethod
def prefix(string):
@ -50,66 +46,14 @@ class Language(object):
def cluster(string):
return 0
@staticmethod
def is_alpha(string):
return orth.is_alpha(string)
@staticmethod
def is_ascii(string):
return orth.is_ascii(string)
@staticmethod
def is_digit(string):
return string.isdigit()
@staticmethod
def is_lower(string):
return orth.is_lower(string)
@staticmethod
def is_punct(string):
return orth.is_punct(string)
@staticmethod
def is_space(string):
return string.isspace()
@staticmethod
def is_title(string):
return orth.is_title(string)
@staticmethod
def is_bracket(string):
return orth.is_bracket(string)
@staticmethod
def is_quote(string):
return orth.is_quote(string)
@staticmethod
def is_left_punct(string):
return orth.is_left_punct(string)
@staticmethod
def is_right_punct(string):
return orth.is_right_punct(string)
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@staticmethod
def like_url(string):
return orth.like_url(string)
@staticmethod
def like_num(string):
return orth.like_number(string)
@staticmethod
def like_email(string):
return orth.like_email(string)
@staticmethod
def is_stop(string):
return 0
@ -120,26 +64,26 @@ class Language(object):
return {
attrs.LOWER: cls.lower,
attrs.NORM: cls.norm,
attrs.SHAPE: cls.shape,
attrs.SHAPE: orth.word_shape,
attrs.PREFIX: cls.prefix,
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: oov_prob,
attrs.IS_ALPHA: cls.is_alpha,
attrs.IS_ASCII: cls.is_ascii,
attrs.IS_ALPHA: orth.is_alpha,
attrs.IS_ASCII: orth.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
attrs.IS_LOWER: cls.is_lower,
attrs.IS_PUNCT: cls.is_punct,
attrs.IS_LOWER: orth.is_lower,
attrs.IS_PUNCT: orth.is_punct,
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title,
attrs.IS_UPPER: cls.is_upper,
attrs.FLAG14: cls.is_bracket,
attrs.FLAG15: cls.is_quote,
attrs.FLAG16: cls.is_left_punct,
attrs.FLAG17: cls.is_right_punct,
attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_num,
attrs.LIKE_EMAIL: cls.like_email,
attrs.IS_TITLE: orth.is_title,
attrs.IS_UPPER: orth.is_upper,
attrs.FLAG14: orth.is_bracket,
attrs.FLAG15: orth.is_quote,
attrs.FLAG16: orth.is_left_punct,
attrs.FLAG17: orth.is_right_punct,
attrs.LIKE_URL: orth.like_url,
attrs.LIKE_NUM: orth.like_number,
attrs.LIKE_EMAIL: orth.like_email,
attrs.IS_STOP: cls.is_stop,
attrs.IS_OOV: lambda string: True
}

View File

@ -5,9 +5,6 @@ import unicodedata
import re
TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
# Binary string features
cpdef bint is_alpha(unicode string):
return string.isalpha()
@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
else:
return True
cpdef bint is_bracket(unicode string):
return False
brackets = ('(',')','[',']','{','}','<','>')
return string in brackets
cpdef bint is_quote(unicode string):
if string in ('"', "'"):
return True
else:
return False
quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
return string in quotes
cpdef bint is_left_punct(unicode string):
return False
left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
return string in left_punct
cpdef bint is_right_punct(unicode string):
return False
right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
return string in right_punct
cpdef bint is_title(unicode string):