mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

commit e289896603 (parent: a22101404a)

    * Fix ptb3 module
spacy/en.pyx (14 changed lines)
@@ -3,7 +3,7 @@
 '''Tokenize English text, using a scheme that differs from the Penn Treebank 3
 scheme in several important respects:
 
-* Whitespace added as tokens, except for single spaces. e.g.,
+* Whitespace is added as tokens, except for single spaces. e.g.,
 
     >>> tokenize(u'\\nHello  \\tThere').strings
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
@@ -18,13 +18,15 @@ scheme in several important respects:
     >>> tokenize(u'New York-based').strings
     [u'New', u'York', u'-', u'based']
 
+Other improvements:
+
 * Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
   as a pre-process before tokenization.)
 
-Take care to ensure you training and run-time data is tokenized according to the
+Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
@@ -49,7 +51,6 @@ from .orthography.latin import *
 from .lexeme import *
 
 
-
 cdef class English(spacy.Language):
     # How to ensure the order here aligns with orthography.latin?
     view_funcs = [
@@ -101,7 +102,7 @@ cpdef Tokens tokenize(unicode string):
     The tokenization rules are defined in two places:
 
     * The data/en/tokenization table, which handles special cases like contractions;
-    * The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
+    * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
 
     Args:
         string (unicode): The string to be tokenized.
@@ -113,9 +114,10 @@ cpdef Tokens tokenize(unicode string):
 
 
 cpdef LexID lookup(unicode string) except 0:
-    """Retrieve (or create, if not found) a Lexeme ID for a string.
+    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
 
-    The LexID is really a memory address, making dereferencing it essentially free.
+    Properties of the Lexeme are accessed by passing LexID to the accessor methods.
+    Access is cheap/free, as the LexID is the memory address of the Lexeme.
     
     Args:
         string (unicode):  The string to be looked up. Must be unicode, not bytes.
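Note: a usage sketch of the LexID pattern this docstring describes (illustrative only; it assumes `lookup` is importable from `spacy.en` and the accessors from `spacy.lexeme`, as the declarations elsewhere in this commit suggest):

    # Illustrative sketch, not part of the commit.
    from spacy.en import lookup
    from spacy.lexeme import length_of, prob_of

    word = lookup(u'Hello')   # LexID: the memory address of the Lexeme
    print(length_of(word))    # accessors just dereference the pointer
    print(prob_of(word))      # smoothed unigram log probability (<= 0.0)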
spacy/lexeme.pxd

@@ -25,10 +25,15 @@ cdef struct Lexeme:
 cpdef StringHash lex_of(LexID lex_id) except 0
 cpdef char first_of(LexID lex_id) except 0
 cpdef size_t length_of(LexID lex_id) except 0
-cpdef double prob_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 1
 cpdef ClusterID cluster_of(LexID lex_id) except 0
 
-cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint is_often_titled(size_t lex_id)
+
+cpdef bint is_often_uppered(size_t lex_id)
+
+
+cpdef bint can_tag(LexID lex, TagFlags flag) except *
 cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
 cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
 
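Note on the `except 0` -> `except 1` change to `prob_of`: the function returns a log probability, which is always <= 0.0, so 0.0 is a legitimate return value and cannot double as Cython's error sentinel, whereas 1.0 can never be a genuine log probability. A minimal sketch of the pattern (the struct field name `prob` is assumed here for illustration):

    cpdef double prob_of(LexID lex_id) except 1:
        # Cython reserves the return value 1.0 to mean "a Python exception
        # was raised inside this function". Log probabilities are <= 0.0,
        # so 1.0 is unreachable as a real result, while 0.0 is not.
        return (<Lexeme*>lex_id).prob  # field name assumed for this sketch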
spacy/lexeme.pyx

@@ -11,6 +11,21 @@ from libc.stdint cimport uint64_t
 
 from spacy.spacy cimport StringHash
 
+# Python-visible enum for POS tags
+PUNCT = 0
+CONJ = 1
+NUM = 2
+X = 3
+DET = 4
+ADP = 5
+ADJ = 6
+ADV = 7
+VERB = 8
+NOUN = 9
+PDT = 10
+POS = 11
+PRON = 12
+PRT = 13
 
 cpdef int set_flags(LexID lex_id, object active_flags) except *:
     """Set orthographic bit flags for a Lexeme.
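Note: the new module-level constants make the POS tag ids visible from Python. A hedged sketch of how they might combine with the renamed `can_tag` accessor (import paths and the example word's behavior are assumptions, not taken from the commit):

    # Illustrative sketch, not part of the commit.
    from spacy.en import lookup
    from spacy.lexeme import can_tag, VERB, NOUN

    word = lookup(u'run')
    print(bool(can_tag(word, VERB)))  # is the VERB bit set in possible_tags?
    print(bool(can_tag(word, NOUN)))  # 'run' can plausibly be either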
@@ -75,7 +90,7 @@ cpdef size_t length_of(size_t lex_id) except 0:
     return word.length
 
 
-cpdef double prob_of(size_t lex_id) except 0:
+cpdef double prob_of(size_t lex_id) except 1:
     '''Access an estimate of the word's unigram log probability.
 
     Probabilities are calculated from a large text corpus, and smoothed using
@@ -90,7 +105,7 @@ cpdef double prob_of(size_t lex_id) except 0:
 DEF OFT_UPPER = 1
 DEF OFT_TITLE = 2
 
-cpdef bint is_oft_upper(size_t lex_id):
+cpdef bint is_often_uppered(size_t lex_id):
     '''Check the OFT_UPPER distributional flag for the word.
     
     The OFT_UPPER flag records whether a lower-cased version of the word
@@ -101,15 +116,15 @@ cpdef bint is_oft_upper(size_t lex_id):
     Case statistics are estimated from a large text corpus. Estimates are read
     from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
     
-    >>> is_oft_upper(lookup(u'nato'))
+    >>> is_often_uppered(lookup(u'nato'))
     True
-    >>> is_oft_upper(lookup(u'the'))
+    >>> is_often_uppered(lookup(u'the'))
     False
     '''
     return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
 
 
-cpdef bint is_oft_title(size_t lex_id):
+cpdef bint is_often_titled(size_t lex_id):
     '''Check the OFT_TITLE distributional flag for the word.
     
     The OFT_TITLE flag records whether a lower-cased version of the word
@@ -127,6 +142,7 @@ cpdef bint is_oft_title(size_t lex_id):
     '''
     return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
 
+
 cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
     return (<Lexeme*>lex_id).orth_flags & (1 << flag)
 
@@ -135,5 +151,5 @@ cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
     return (<Lexeme*>lex_id).dist_flags & (1 << flag)
 
 
-cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
     return (<Lexeme*>lex_id).possible_tags & (1 << flag)
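Note: these predicates all share one pattern: each Boolean property is a bit position in a packed integer field, tested with `flags & (1 << flag)`. A self-contained Python illustration of the same packing, reusing the OFT_* positions defined above:

    # Pure-Python illustration of the bit-flag scheme used by the accessors.
    OFT_UPPER = 1
    OFT_TITLE = 2

    dist_flags = 0
    dist_flags |= (1 << OFT_UPPER)              # set the "often upper-cased" bit

    print(bool(dist_flags & (1 << OFT_UPPER)))  # True
    print(bool(dist_flags & (1 << OFT_TITLE)))  # False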
spacy/ptb3.pxd

@@ -1,18 +1,15 @@
-from libcpp.vector cimport vector
-
-from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport LexID
 from spacy.tokens cimport Tokens
+from spacy.lexeme cimport StringHash
 
 
-cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word)
+cdef class PennTreebank3(Language):
+    cpdef list find_substrings(self, unicode word)
     
 
-cdef EnglishPTB EN_PTB
+cdef PennTreebank3 PTB3
 
-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
spacy/ptb3.pyx (106 changed lines)
@@ -7,55 +7,89 @@ from __future__ import unicode_literals
 
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector
 
-from spacy.string_tools cimport substr
-from spacy.spacy cimport Language
-from . import util
 
 cimport spacy
 
+import re
+
+# List of contractions adapted from Robert MacIntyre's tokenizer.
+CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
+                 re.compile(r"(?i)\b(d)('ye)\b"),
+                 re.compile(r"(?i)\b(gim)(me)\b"),
+                 re.compile(r"(?i)\b(gon)(na)\b"),
+                 re.compile(r"(?i)\b(got)(ta)\b"),
+                 re.compile(r"(?i)\b(lem)(me)\b"),
+                 re.compile(r"(?i)\b(mor)('n)\b"),
+                 re.compile(r"(?i)\b(wan)(na) ")]
+
+CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
+                 re.compile(r"(?i) ('t)(was)\b")]
+
+CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
+                 re.compile(r"(?i)\b(wha)(t)(cha)\b")]
+
+
+def nltk_regex_tokenize(text):
+    # Implementation taken from NLTK 3.0, based on tokenizer.sed
+
+    #starting quotes
+    text = re.sub(r'^\"', r'``', text)
+    text = re.sub(r'(``)', r' \1 ', text)
+    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
+
+    #punctuation
+    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
+    text = re.sub(r'\.\.\.', r' ... ', text)
+    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
+    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
+    text = re.sub(r'[?!]', r' \g<0> ', text)
+
+    text = re.sub(r"([^'])' ", r"\1 ' ", text)
+
+    #parens, brackets, etc.
+    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
+    text = re.sub(r'--', r' -- ', text)
+
+    #add extra space to make things easier
+    text = " " + text + " "
+
+    #ending quotes
+    text = re.sub(r'"', " '' ", text)
+    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
+
+    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
+    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
+                  text)
+
+    for regexp in CONTRACTIONS2:
+        text = regexp.sub(r' \1 \2 ', text)
+    for regexp in CONTRACTIONS3:
+        text = regexp.sub(r' \1 \2 ', text)
+
+    # We are not using CONTRACTIONS4 since
+    # they are also commented out in the SED scripts
+    # for regexp in self.CONTRACTIONS4:
+    #     text = regexp.sub(r' \1 \2 \3 ', text)
+
+    return text.split()
+
+
-cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word):
-        length = len(word)
-        cdef int i = 0
-        # Contractions
-        if word.endswith("'s"):
-            return length - 2
-        # Leading punctuation
-        if is_punct(word, 0, length):
-            return 1
-        elif length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not is_punct(word, i, length):
-                i += 1
-        return i
-
-
-cdef bint is_punct(unicode word, size_t i, size_t length):
-    is_final = i == (length - 1)
-    if word[i] == '.':
-        return False
-    if not is_final and word[i] == '-' and word[i+1] == '-':
-        return True
-    # Don't count appostrophes as punct if the next char is a letter
-    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
-        return False
-    punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
-    return word[i] in punct_chars
+cdef class PennTreebank3(Language):
+    cpdef list find_substrings(self, unicode chunk):
+        strings = nltk_regex_tokenize(chunk)
+        assert strings
+        return strings
     
 
-cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
+cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
 
 cpdef Tokens tokenize(unicode string):
-    return EN_PTB.tokenize(string)
+    return PTB3.tokenize(string)
 
 
-cpdef Lexeme_addr lookup(unicode string) except 0:
-    return <Lexeme_addr>EN_PTB.lookup(string)
+cpdef LexID lookup(unicode string) except 0:
+    return <LexID>PTB3.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
-    return EN_PTB.unhash(hash_value)
+    return PTB3.unhash(hash_value)
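Note: `nltk_regex_tokenize` is pure Python, so the new tokenization rules can be checked directly. A quick illustrative run (the expected output is reasoned from the regexes above, not taken from the commit):

    print(nltk_regex_tokenize(u"I can't believe it's not butter."))
    # ['I', 'ca', "n't", 'believe', 'it', "'s", 'not', 'butter', '.']

The `'s` and `n't` splits come from the possessive/clitic rules, and the final period is split off by the end-of-string rule, matching PTB conventions.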