Mirror of https://github.com/explosion/spaCy.git
commit 45a22d6b2c
parent c282e6d5fb

    Docs coming together

@@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon
 .. toctree::
     :maxdepth: 3
 
-    guide/overview
-    guide/install
+    guide/overview.rst
+    guide/install.rst
+
+    api/index.rst
+
+    modules/index.rst
 
 
 Source (GitHub)
 ----------------
 

@@ -4,4 +4,4 @@ cimport cython
 
 
 cdef class English(Language):
-    cpdef int _split_one(self, unicode word)
+    cdef int _split_one(self, unicode word)

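Note: a cpdef method gets both a C entry point and a Python wrapper, while a
cdef method is C-only, so this change removes _split_one from the Python API
of English. A minimal sketch of the distinction (hypothetical Splitter class,
not from this commit):

    cdef class Splitter:
        cdef int _split_one(self, unicode word):
            # C-only: callable from Cython through a typed reference,
            # invisible from Python.
            return len(word)

        cpdef list split(self, unicode string):
            # cpdef: callable from Python, still a fast C call from Cython.
            cdef int n = self._split_one(string)
            return [string[:n]]
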
spacy/en.pyx (16 changed lines)

@@ -5,22 +5,21 @@ scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,
 
-    >>> [w.string for w in tokenize(u'\\nHello  \\tThere')]
+    >>> [w.string for w in EN.tokenize(u'\\nHello  \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
 
 * Contractions are normalized, e.g.
 
-    >>> [w.string for w in u"isn't ain't won't he's")]
+    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
-   
 
 * Hyphenated words are split, with the hyphen preserved, e.g.:
 
-    >>> [w.string for w in tokenize(u'New York-based')]
+    >>> [w.string for w in EN.tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']
 
 Other improvements:
 
 * Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection

@@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 
 
 cdef class English(Language):
+    """English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): The two letter code used by Wikipedia for the language.
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
+
     def __cinit__(self, name):
         flag_funcs = [0 for _ in range(NR_FLAGS)]
         

@@ -110,7 +116,7 @@ cdef class English(Language):
         
         Language.__init__(self, name, flag_funcs)
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):

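The doctests above now go through the module-level EN instance rather than a
bare tokenize function. A quick interactive check, assuming EN is importable
from spacy.en as the examples imply:

    >>> from spacy.en import EN
    >>> [w.string for w in EN.tokenize(u'New York-based')]
    [u'New', u'York', u'-', u'based']
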
@@ -4,21 +4,22 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
-    cdef list string_features
-    cdef list flag_features
-
-    cdef dict _dict
-
     cpdef Lexeme lookup(self, unicode string)
+
+    cdef dict _dict
+
+    cdef list _string_features
+    cdef list _flag_features
 
 
 cdef class Language:
-    cdef object name
+    cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon
 
     cpdef list tokenize(self, unicode text)
+    cpdef Lexeme lookup(self, unicode text)
 
     cdef list _tokenize(self, unicode string)
-    cpdef list _split(self, unicode string)
-    cpdef int _split_one(self, unicode word)
+    cdef list _split(self, unicode string)
+    cdef int _split_one(self, unicode word)

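With these declarations, only lookup and tokenize stay reachable from Python;
the underscore-prefixed cdef attributes and the cdef methods are C-level only.
A hypothetical session showing the boundary (assumes a constructed Lexicon
instance named lex):

    >>> lexeme = lex.lookup(u'hello')   # cpdef: has a Python wrapper
    >>> lex._string_features            # cdef attribute: no Python access
    Traceback (most recent call last):
        ...
    AttributeError: ...
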
@@ -41,7 +41,7 @@ cdef class Language:
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
-        self.load_special_tokenization(rules)
+        self._load_special_tokenization(rules)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -75,6 +75,17 @@ cdef class Language:
         assert tokens
         return tokens
 
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
+
     cdef list _tokenize(self, unicode string):
         if string in self.cache:
             return self.cache[string]
@@ -85,7 +96,7 @@ cdef class Language:
         self.cache[string] = lexemes
         return lexemes
 
-    cpdef list _split(self, unicode string):
+    cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
@@ -107,10 +118,10 @@ cdef class Language:
             string = string[split:]
         return substrings
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         return len(word)
 
-    def load_special_tokenization(self, token_rules):
+    def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
 
         Loads special-case tokenization rules into the Language.cache cache,
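
The hunk above shows only the tail of _split; the loop it implies peels one
substring at a time off the front of the span, asking _split_one how much to
take. A reconstruction of that loop (a sketch, not verbatim from the file):

    cdef list _split(self, unicode string):
        substrings = []
        while string:
            split = self._split_one(string)   # default: take the whole word
            substrings.append(string[:split])
            string = string[split:]
        return substrings
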
@@ -132,14 +143,14 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
-        self.flag_features = flag_features
-        self.string_features = string_features
+        self._flag_features = flag_features
+        self._string_features = string_features
         self._dict = {}
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self.string_features, self.flag_features)
+                          self._string_features, self._flag_features)
             self._dict[string] = word
 
     cpdef Lexeme lookup(self, unicode string):
@@ -155,7 +166,7 @@ cdef class Lexicon:
         if string in self._dict:
             return self._dict[string]
         
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
-                                  self.flag_features)
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+                                  self._flag_features)
         self._dict[string] = word
         return word
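
Because Lexicon.lookup stores every new Lexeme in _dict before returning it,
repeated lookups of the same string yield the same object, and the new
Language.lookup inherits that behaviour by delegation. For example, assuming
the module-level EN instance:

    >>> first = EN.lookup(u'Hello')
    >>> second = EN.lookup(u'Hello')    # served from Lexicon._dict
    >>> first is second
    True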