Replace the Language static-method wrappers with direct references to the corresponding orth functions

Implement the punctuation functions (is_bracket, is_quote, is_left_punct, is_right_punct) in orth
2025-08-02 03:10:22 +03:00 · 2016-03-09 18:07:37 +01:00 · 2016-03-09 18:07:37 +01:00 · bc9c62e279
commit bc9c62e279
parent d9312bc9ea
2 changed files with 26 additions and 80 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -33,10 +33,6 @@ class Language(object):
    @staticmethod
    def norm(string):
        return string
-    
-    @staticmethod
-    def shape(string):
-        return orth.word_shape(string)

    @staticmethod
    def prefix(string):
@ -50,66 +46,14 @@ class Language(object):
    def cluster(string):
        return 0

-    @staticmethod
-    def is_alpha(string):
-        return orth.is_alpha(string)
-
-    @staticmethod
-    def is_ascii(string):
-        return orth.is_ascii(string)
-
    @staticmethod
    def is_digit(string):
        return string.isdigit()

-    @staticmethod
-    def is_lower(string):
-        return orth.is_lower(string)
-
-    @staticmethod
-    def is_punct(string):
-        return orth.is_punct(string)
-
    @staticmethod
    def is_space(string):
        return string.isspace()

-    @staticmethod
-    def is_title(string):
-        return orth.is_title(string)
-
-    @staticmethod
-    def is_bracket(string):
-        return orth.is_bracket(string)
-
-    @staticmethod
-    def is_quote(string):
-        return orth.is_quote(string)
-
-    @staticmethod
-    def is_left_punct(string):
-        return orth.is_left_punct(string)
-
-    @staticmethod
-    def is_right_punct(string):
-        return orth.is_right_punct(string)
-
-    @staticmethod
-    def is_upper(string):
-        return orth.is_upper(string)
-
-    @staticmethod
-    def like_url(string):
-        return orth.like_url(string)
-
-    @staticmethod
-    def like_num(string):
-        return orth.like_number(string)
-
-    @staticmethod
-    def like_email(string):
-        return orth.like_email(string)
-
    @staticmethod
    def is_stop(string):
        return 0
@ -120,26 +64,26 @@ class Language(object):
        return {
            attrs.LOWER: cls.lower,
            attrs.NORM: cls.norm,
-            attrs.SHAPE: cls.shape,
+            attrs.SHAPE: orth.word_shape,
            attrs.PREFIX: cls.prefix,
            attrs.SUFFIX: cls.suffix,
            attrs.CLUSTER: cls.cluster,
            attrs.PROB: lambda string: oov_prob,
-            attrs.IS_ALPHA: cls.is_alpha,
-            attrs.IS_ASCII: cls.is_ascii,
+            attrs.IS_ALPHA: orth.is_alpha,
+            attrs.IS_ASCII: orth.is_ascii,
            attrs.IS_DIGIT: cls.is_digit,
-            attrs.IS_LOWER: cls.is_lower,
-            attrs.IS_PUNCT: cls.is_punct,
+            attrs.IS_LOWER: orth.is_lower,
+            attrs.IS_PUNCT: orth.is_punct,
            attrs.IS_SPACE: cls.is_space,
-            attrs.IS_TITLE: cls.is_title,
-            attrs.IS_UPPER: cls.is_upper,
-            attrs.FLAG14: cls.is_bracket,
-            attrs.FLAG15: cls.is_quote,
-            attrs.FLAG16: cls.is_left_punct,
-            attrs.FLAG17: cls.is_right_punct,
-            attrs.LIKE_URL: cls.like_url,
-            attrs.LIKE_NUM: cls.like_num,
-            attrs.LIKE_EMAIL: cls.like_email,
+            attrs.IS_TITLE: orth.is_title,
+            attrs.IS_UPPER: orth.is_upper,
+            attrs.FLAG14: orth.is_bracket,
+            attrs.FLAG15: orth.is_quote,
+            attrs.FLAG16: orth.is_left_punct,
+            attrs.FLAG17: orth.is_right_punct,
+            attrs.LIKE_URL: orth.like_url,
+            attrs.LIKE_NUM: orth.like_number,
+            attrs.LIKE_EMAIL: orth.like_email,
            attrs.IS_STOP: cls.is_stop,
            attrs.IS_OOV: lambda string: True
        }
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@ -5,9 +5,6 @@ import unicodedata
 import re


-TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
-
-
 # Binary string features
 cpdef bint is_alpha(unicode string):
    return string.isalpha()
@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string):
    else:
        return True

+
 cpdef bint is_bracket(unicode string):
-    return False
+    brackets = ('(',')','[',']','{','}','<','>')
+    return string in brackets
+

 cpdef bint is_quote(unicode string):
-    if string in ('"', "'"):
-        return True
-    else:
-        return False
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
+    return string in quotes
+

 cpdef bint is_left_punct(unicode string):
-    return False
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')        
+    return string in left_punct
+

 cpdef bint is_right_punct(unicode string):
-    return False
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')        
+    return string in right_punct


 cpdef bint is_title(unicode string):