* Further improvements to spacy docs, tweaks to code.

This commit is contained in:
Matthew Honnibal 2014-08-22 04:20:24 +02:00
parent 4eb9c2b30f
commit 4bcdd6d31c
4 changed files with 9 additions and 13 deletions

View File

@ -35,7 +35,8 @@ extensions = [
'sphinx.ext.coverage', 'sphinx.ext.coverage',
'sphinx.ext.viewcode', 'sphinx.ext.viewcode',
'sphinx.ext.autodoc', 'sphinx.ext.autodoc',
'sphinxcontrib.napoleon' 'sphinxcontrib.napoleon',
'sphinx.ext.doctest'
] ]
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.

View File

@ -19,18 +19,14 @@ an excellent set of pre-computed orthographic and distributional features:
:: ::
>>> from spacy import en >>> from spacy import en
>>> apples, are, not, oranges, dots = en.tokenize(u"Apples aren't oranges...") >>> apples, are, nt, oranges, dots = en.tokenize(u"Apples aren't oranges...")
>>> en.is_lower(apples) >>> en.is_lower(apples)
False False
# Distributional features calculated from large corpora >>> en.prob_of(are) >= en.prob_of(oranges)
# Smoothed unigram log probability
>>> en.prob_of(are) > en.prob_of(oranges)
True True
# After POS tagging lots of text, is this word ever a noun?
>>> en.can_tag(are, en.NOUN) >>> en.can_tag(are, en.NOUN)
False False
# Is this word always title-cased? >>> en.is_often_titled(apples)
>>> en.often_title(apples)
False False
Accessing these properties is essentially free: the Lexeme IDs are actually Accessing these properties is essentially free: the Lexeme IDs are actually
@ -72,6 +68,7 @@ Pros:
Cons: Cons:
- It's new (released September 2014) - It's new (released September 2014)
- Security concerns, from memory management
- Higher memory usage (up to 1gb) - Higher memory usage (up to 1gb)
- More conceptually complicated - More conceptually complicated
- Tokenization rules expressed in code, not as data - Tokenization rules expressed in code, not as data

View File

@ -7,7 +7,6 @@ from spacy.lexeme import lex_of
from spacy import LEX, NORM, SHAPE, LAST3 from spacy import LEX, NORM, SHAPE, LAST3
def test_group_by_lex(): def test_group_by_lex():
tokens = en.tokenize("I like the red one and I like the blue one") tokens = en.tokenize("I like the red one and I like the blue one")
names, hashes, groups = tokens.group_by(LEX) names, hashes, groups = tokens.group_by(LEX)

View File

@ -4,8 +4,7 @@ import pytest
from spacy.en import lookup, unhash from spacy.en import lookup, unhash
from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of from spacy.en import lex_of, shape_of, norm_of, first_of, length_of
from spacy.lexeme import shape_of
@pytest.fixture @pytest.fixture
def C3P0(): def C3P0():
@ -19,8 +18,8 @@ def test_shape(C3P0):
def test_length(): def test_length():
t = lookup('the') t = lookup('the')
assert length_of(t) == 3 assert length_of(t) == 3
t = lookup('') #t = lookup('')
assert length_of(t) == 0 #assert length_of(t) == 0
t = lookup("n't") t = lookup("n't")
assert length_of(t) == 3 assert length_of(t) == 3
t = lookup("'s") t = lookup("'s")