* Docs coming together

Matthew Honnibal 2014-08-29 01:59:23 +02:00
parent c282e6d5fb
commit 45a22d6b2c
5 changed files with 47 additions and 25 deletions

View File

@@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon
 .. toctree::
    :maxdepth: 3

-   guide/overview
-   guide/install
+   guide/overview.rst
+   guide/install.rst
    api/index.rst
+   modules/index.rst

 Source (GitHub)
 ----------------

View File

@@ -4,4 +4,4 @@ cimport cython
 cdef class English(Language):
-    cpdef int _split_one(self, unicode word)
+    cdef int _split_one(self, unicode word)
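
A note on the change above: in Cython, a cpdef method gets a Python-visible wrapper as well as a C entry point, while a cdef method can be called only from Cython/C code, so this declaration change makes _split_one a purely internal hook. Below is a minimal sketch of that distinction, using hypothetical names (Splitter, split) rather than spaCy's actual classes:

    # Hypothetical sketch, not spaCy code: cpdef methods are callable from
    # Python, cdef methods are reachable only from Cython/C.
    cdef class Splitter:
        cpdef list split(self, unicode chunk):
            # Public entry point: visible to Python callers.
            cdef list pieces = []
            cdef int n
            while chunk:
                n = self._split_one(chunk)
                pieces.append(chunk[:n])
                chunk = chunk[n:]
            return pieces

        cdef int _split_one(self, unicode word):
            # Internal hook: after a cpdef -> cdef change like the one above,
            # Python code can no longer call or override this method.
            return len(word)

The same reasoning applies to the _split and _split_one declarations further down in this commit.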

View File

@@ -5,22 +5,21 @@ scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,

-    >>> [w.string for w in tokenize(u'\\nHello \\tThere')]
+    >>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']

 * Contractions are normalized, e.g.

-    >>> [w.string for w in u"isn't ain't won't he's")]
+    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]

 * Hyphenated words are split, with the hyphen preserved, e.g.:

-    >>> [w.string for w in tokenize(u'New York-based')]
+    >>> [w.string for w in EN.tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']

 Other improvements:

-* Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
@@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 cdef class English(Language):
+    """English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): The two letter code used by Wikipedia for the language.
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
     def __cinit__(self, name):
         flag_funcs = [0 for _ in range(NR_FLAGS)]
@@ -110,7 +116,7 @@ cdef class English(Language):
         Language.__init__(self, name, flag_funcs)

-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
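
The docstring examples above assume a ready-made English instance named EN. As a rough usage sketch only (the import path and the constructor call are assumptions, not something this commit documents), the behaviour the docstring describes would look like:

    # Usage sketch only; module path and instance construction are assumed.
    from spacy.en import English

    EN = English('en')   # name argument as in __cinit__(self, name)

    # Whitespace other than single spaces comes back as tokens.
    [w.string for w in EN.tokenize(u'\nHello \tThere')]

    # Contractions are normalized into their expanded forms.
    [w.string for w in EN.tokenize(u"isn't")]            # -> [u'is', u'not'] per the docstring

    # Hyphenated words are split with the hyphen preserved.
    [w.string for w in EN.tokenize(u'New York-based')]   # -> [u'New', u'York', u'-', u'based']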

View File

@@ -4,21 +4,22 @@ from spacy.word cimport Lexeme
 cdef class Lexicon:
-    cdef list string_features
-    cdef list flag_features
-    cdef dict _dict
     cpdef Lexeme lookup(self, unicode string)
+
+    cdef dict _dict
+    cdef list _string_features
+    cdef list _flag_features

 cdef class Language:
-    cdef object name
+    cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon

     cpdef list tokenize(self, unicode text)
+    cpdef Lexeme lookup(self, unicode text)

     cdef list _tokenize(self, unicode string)
-    cpdef list _split(self, unicode string)
-    cpdef int _split_one(self, unicode word)
+    cdef list _split(self, unicode string)
+    cdef int _split_one(self, unicode word)
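
With these declarations, Python callers see only the cpdef surface (tokenize, lookup, and the readonly lexicon attribute), while the underscore-prefixed cdef attributes and methods stay reachable from other Cython modules that cimport these classes. A hedged sketch of that Cython-level path, with the spacy.lang module path inferred rather than documented here:

    # Sketch only: cdef-level internals stay usable from Cython via cimport,
    # even though they are invisible to plain Python code.
    from spacy.lang cimport Language   # module path assumed

    cdef list split_in_cython(Language lang, unicode chunk):
        # _split is cdef-only now, so this call compiles here, but the
        # equivalent lang._split(chunk) would fail from Python.
        return lang._split(chunk)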

View File

@@ -41,7 +41,7 @@ cdef class Language:
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
-        self.load_special_tokenization(rules)
+        self._load_special_tokenization(rules)

     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -75,6 +75,17 @@ cdef class Language:
         assert tokens
         return tokens

+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
+
     cdef list _tokenize(self, unicode string):
         if string in self.cache:
             return self.cache[string]
@@ -85,7 +96,7 @@ cdef class Language:
         self.cache[string] = lexemes
         return lexemes

-    cpdef list _split(self, unicode string):
+    cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.

         This method calls find_split repeatedly. Most languages will want to
@@ -107,10 +118,10 @@ cdef class Language:
             string = string[split:]
         return substrings

-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         return len(word)

-    def load_special_tokenization(self, token_rules):
+    def _load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language.cache cache,
@@ -132,14 +143,14 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
-        self.flag_features = flag_features
-        self.string_features = string_features
+        self._flag_features = flag_features
+        self._string_features = string_features
         self._dict = {}
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self.string_features, self.flag_features)
+                          self._string_features, self._flag_features)
             self._dict[string] = word

     cpdef Lexeme lookup(self, unicode string):
@@ -155,7 +166,7 @@ cdef class Lexicon:
         if string in self._dict:
             return self._dict[string]
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
-                                  self.flag_features)
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+                                  self._flag_features)
         self._dict[string] = word
         return word
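
The lookup method added to Language above simply delegates to Lexicon.lookup, and Lexicon.lookup memoizes each Lexeme in _dict, so repeated lookups of the same string return the same object. A short behavioural sketch (instance name and import path assumed, as before):

    # Behavioural sketch of the lookup path added in this commit.
    from spacy.en import English

    EN = English('en')

    dog = EN.lookup(u'dog')            # delegates to EN.lexicon.lookup(u'dog')
    assert dog is EN.lookup(u'dog')    # Lexicon._dict caches one Lexeme per string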