add doctests for website 'api'-section

This commit is contained in:
Henning Peters 2015-09-28 14:22:13 +02:00
parent c3164f9cbe
commit f0360bf59d
5 changed files with 227 additions and 136 deletions

View File

@ -17,7 +17,7 @@ prefix = "test_"
for filename in os.listdir(src_dirname): for filename in os.listdir(src_dirname):
match = re.match(re.escape(prefix) + r"(.+)\.py", filename) match = re.match(re.escape(prefix) + r"(.+)\.py$", filename)
if not match: if not match:
continue continue
@ -25,27 +25,23 @@ for filename in os.listdir(src_dirname):
source = open(os.path.join(src_dirname, filename)).readlines() source = open(os.path.join(src_dirname, filename)).readlines()
tree = ast.parse("".join(source)) tree = ast.parse("".join(source))
for item in tree.body: for root in tree.body:
if isinstance(item, ast.FunctionDef) and item.name.startswith(prefix): if isinstance(root, ast.FunctionDef) and root.name.startswith(prefix):
# only ast.expr and ast.stmt have line numbers, see: # only ast.expr and ast.stmt have line numbers, see:
# https://docs.python.org/2/library/ast.html#ast.AST.lineno # https://docs.python.org/2/library/ast.html#ast.AST.lineno
line_numbers = [] line_numbers = []
def fill_line_numbers(node): for node in ast.walk(root):
for child in ast.iter_child_nodes(node): if hasattr(node, "lineno"):
if ((isinstance(child, ast.expr) or line_numbers.append(node.lineno)
isinstance(child, ast.stmt)) and
child.lineno > item.lineno):
line_numbers.append(child.lineno)
fill_line_numbers(child)
fill_line_numbers(item)
body = source[min(line_numbers)-1:max(line_numbers)] body = source[min(line_numbers)-1:max(line_numbers)]
while not body[0][0].isspace():
body = body[1:]
# make sure we are inside an indented function body # make sure we are inside an indented function body
assert all([re.match(r"\s", l[0]) for l in body]) assert all([l[0].isspace() for l in body])
offset = 0 offset = 0
for line in body: for line in body:
@ -63,7 +59,7 @@ for filename in os.listdir(src_dirname):
# make sure empty lines contain a newline # make sure empty lines contain a newline
assert all([l[-1] == "\n" for l in body]) assert all([l[-1] == "\n" for l in body])
code_filename = "%s.%s" % (name, item.name[len(prefix):]) code_filename = "%s.%s" % (name, root.name[len(prefix):])
with open(os.path.join(dst_dirname, code_filename), "w") as f: with open(os.path.join(dst_dirname, code_filename), "w") as f:
f.write(escape("".join(body))) f.write(escape("".join(body)))

View File

@ -76,15 +76,8 @@ mixin summary
block block
mixin en_example mixin en_example
pre.language-python pre.language-python: code
code include ../../code/api.example_war_and_peace
| from spacy.en import English
| from spacy._doc_examples import download_war_and_peace
|
| unprocessed_unicode = download_war_and_peace()
|
| nlp = English()
| doc = nlp(unprocessed_unicode)
mixin SeeAlso(name, link_target) mixin SeeAlso(name, link_target)
a(href=link_target) a(href=link_target)
@ -197,19 +190,19 @@ mixin Func(type1, type2)
pre.language-python pre.language-python
code code
| >>> nlp = spacy.en.English() | nlp = spacy.en.English()
p To keep the default components, but load data from a specified directory, use: p To keep the default components, but load data from a specified directory, use:
pre.language-python pre.language-python
code code
| >>> nlp = English(data_dir=u'path/to/data_directory') | nlp = English(data_dir=u'path/to/data_directory')
p To disable (and avoid loading) parts of the processing pipeline: p To disable (and avoid loading) parts of the processing pipeline:
pre.language-python pre.language-python
code code
| >>> nlp = English(parser=False, tagger=False, entity=False) | nlp = English(parser=False, tagger=False, entity=False)
+params +params
+param("data_dir") +param("data_dir")
@ -249,17 +242,8 @@ mixin Func(type1, type2)
+param("entity", types.bool) +param("entity", types.bool)
| Whether to apply the named entity recognizer. | Whether to apply the named entity recognizer.
pre.language-python pre.language-python: code
code include ../../code/api.main_entry_point
| from spacy.en import English
| nlp = English()
| doc = nlp(u'Some text.) # Applies tagger, parser, entity
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
| doc = nlp(u'') # Zero-length tokens, not an error
| # doc = nlp(b'Some text') <-- Error: need unicode
| doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
+declare_class("Doc", "doc") +declare_class("Doc", "doc")
@ -297,41 +281,19 @@ mixin Func(type1, type2)
+attribute("sents", types.generator)(open=true) +attribute("sents", types.generator)(open=true)
| Yields sentence #[code Span] objects. Iterate over the span to get individual #[code Token] objects. Sentence spans have no label. | Yields sentence #[code Span] objects. Iterate over the span to get individual #[code Token] objects. Sentence spans have no label.
pre.language-python pre.language-python: code
code include ../../code/api.sentence_spans
| >>> from spacy.en import English
| >>> nlp = English()
| >>> doc = nlp(u'This is a sentence. Here's another...')
| >>> for sentence in doc.sents:
| ... sentence.root.orth_
| is
| 's
+attribute("ents", types.generator)(open=true) +attribute("ents", types.generator)(open=true)
| Yields named-entity #[code Span] objects. Iterate over the span to get individual #[code Token] objects, or access the label: | Yields named-entity #[code Span] objects. Iterate over the span to get individual #[code Token] objects, or access the label:
pre.language-python pre.language-python: code
code include ../../code/api.entity_spans
| >>> from spacy.en import English
| >>> nlp = English()
| >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
| >>> ents = list(tokens.ents)
| >>> ents[0].label, ents[0].label_, ents[0].orth_, ents[0].string
| (112504, 'PERSON', 'Best', ents[0].string)
+attribute("noun_chunks", types.generator)(open=true) +attribute("noun_chunks", types.generator)(open=true)
| Yields base noun-phrase #[code Span ] objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it &ndash; so no NP-level coordination, no prepositional phrases, and no relative clauses. For example: | Yields base noun-phrase #[code Span ] objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it &ndash; so no NP-level coordination, no prepositional phrases, and no relative clauses. For example:
pre.language-python pre.language-python: code
code include ../../code/api.noun_chunk_spans
| >>> from spacy.en import English
| >>> nlp = English()
| >>> doc = nlp('The sentence in this example has three noun chunks.')
| >>> for chunk in doc.noun_chunks:
| ... print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
| NP The sentence <-- has
| NP this example <-- in
| NP three noun chunks <-- has
details details
summary: h4 Export/Import summary: h4 Export/Import
@ -346,18 +308,8 @@ mixin Func(type1, type2)
+method("count_by", "attr_id")(open=true) +method("count_by", "attr_id")(open=true)
| Produce a dict of #[code {attribute (int): count (ints)}] frequencies, keyed by the values of the given attribute ID. | Produce a dict of #[code {attribute (int): count (ints)}] frequencies, keyed by the values of the given attribute ID.
pre.language-python pre.language-python: code
code include ../../code/api.count_by
| >>> from spacy.en import English, attrs
| >>> nlp = English()
| >>> tokens = nlp(u'apple apple orange banana')
| >>> tokens.count_by(attrs.ORTH)
| {12800L: 1, 11880L: 2, 7561L: 1}
| >>> tokens.to_array([attrs.ORTH])
| array([[11880],
| [11880],
| [7561],
| [12800]])
+method("from_array", "attrs, array")(open=true) +method("from_array", "attrs, array")(open=true)
Write to a #[code Doc] object, from an M*N array of attributes. Write to a #[code Doc] object, from an M*N array of attributes.
@ -371,10 +323,8 @@ mixin Func(type1, type2)
+method("read_bytes")(open=true) +method("read_bytes")(open=true)
| A staticmethod, used to read serialized #[code Doc] objects from a file. | A staticmethod, used to read serialized #[code Doc] objects from a file.
| For example: | For example:
pre.language-python pre.language-python: code
code include ../../code/api.read_bytes
| for byte_string in Doc.read_bytes(open(location_of_bytes)):
| doc = Doc(nlp.vocab).from_bytes(byte_string)
+declare_class("Token", "token") +declare_class("Token", "token")
p A Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. #[code token.orth] is an integer ID, #[code token.orth_] is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed. p A Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. #[code token.orth] is an integer ID, #[code token.orth_] is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed.
@ -476,11 +426,8 @@ mixin Func(type1, type2)
+Define("token = span[i]") +Define("token = span[i]")
| Get the #[code Token] object at position #[em i], where #[em i] is an offset within the #[code Span], not the document. That is: | Get the #[code Token] object at position #[em i], where #[em i] is an offset within the #[code Span], not the document. That is:
pre.language-python pre.language-python: code
code include ../../code/api.token_span
| span = doc[4:6]
| token = span[0]
| assert token.i == 4
ul ul
+Define("for token in span") +Define("for token in span")
@ -503,53 +450,34 @@ mixin Func(type1, type2)
+attribute("root")(open=true) +attribute("root")(open=true)
| The first ancestor of the first word of the span that has its head outside the span. For example: | The first ancestor of the first word of the span that has its head outside the span. For example:
pre.language-python pre.language-python: code
code include ../../code/api.example_i_like_new_york1
| >>> toks = nlp(u'I like New York in Autumn.')
p Let's name the indices --- easier than writing #[code toks[4]] etc. p Let's name the indices --- easier than writing #[code toks[4]] etc.
pre.language-python: code
pre.language-python include ../../code/api.example_i_like_new_york2
code
| >>> i, like, new, york, in_, autumn, dot = range(len(toks))
p The head of #[em new] is #[em York], and the head of #[em York] is #[em like] p The head of #[em new] is #[em York], and the head of #[em York] is #[em like]
pre.language-python pre.language-python: code
code include ../../code/api.example_i_like_new_york3
| >>> toks[new].head.orth_
| 'York'
| >>> toks[york].head.orth_
| 'like'
p Create a span for "New York". Its root is "York". p Create a span for "New York". Its root is "York".
pre.language-python pre.language-python: code
code include ../../code/api.example_i_like_new_york4
| >>> new_york = toks[new:york+1]
| >>> new_york.root.orth_
| 'York'
p When there are multiple words with external dependencies, we take the first: p When there are multiple words with external dependencies, we take the first:
pre.language-python: code
pre.language-python include ../../code/api.example_i_like_new_york5
code
| >>> toks[autumn].head.orth_, toks[dot].head.orth_
| ('in', like')
| >>> autumn_dot = toks[autumn:]
| >>> autumn_dot.root.orth_
| 'Autumn'
+attribute("lefts")(open=true) +attribute("lefts")(open=true)
| Tokens that are to the left of the span, whose head is within the span, i.e. | Tokens that are to the left of the span, whose head is within the span, i.e.
code.language-python pre.language-python: code
| lefts = [span.doc[i] for i in range(0, span.start) include ../../code/api.navigating_the_parse_tree_lefts
| if span.doc[i].head in span]
+attribute("rights")(open=true) +attribute("rights")(open=true)
| Tokens that are to the right of the span, whose head is within the span, i.e. | Tokens that are to the right of the span, whose head is within the span, i.e.
code.language-python pre.language-python: code
| rights = [span.doc[i] for i in range(span.end, len(span.doc)) include ../../code/api.navigating_the_parse_tree_rights
| if span.doc[i].head in span]
+attribute("subtree")(open=true) +attribute("subtree")(open=true)
| Tokens in the range #[code (start, end+1)], where #[code start] is the index of the leftmost word descended from a token in the span, and #[code end] is the index of the rightmost token descended from a token in the span. | Tokens in the range #[code (start, end+1)], where #[code start] is the index of the leftmost word descended from a token in the span, and #[code end] is the index of the rightmost token descended from a token in the span.
@ -669,10 +597,8 @@ mixin Func(type1, type2)
+Define("for string in string_store")(open=true) +Define("for string in string_store")(open=true)
| Iterate over strings in the string store, in order, such that the #[em i]th string in the sequence has the ID #[em i]: | Iterate over strings in the string store, in order, such that the #[em i]th string in the sequence has the ID #[em i]:
pre.language-python pre.language-python: code
code include ../../code/api.string_store
| for i, string in enumerate(string_store):
| assert i == string_store[string]
+init +init
p #[code StringStore.__init__] takes no arguments, so a new instance can be constructed as follows: p #[code StringStore.__init__] takes no arguments, so a new instance can be constructed as follows:

13
website/tests/conftest.py Normal file
View File

@ -0,0 +1,13 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English
return English()
@pytest.fixture()
def doc(nlp):
return nlp('Hello, world. Here are two sentences.')

163
website/tests/test_api.py Normal file
View File

@ -0,0 +1,163 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
def test_example_war_and_peace(nlp):
# from spacy.en import English
from spacy._doc_examples import download_war_and_peace
unprocessed_unicode = download_war_and_peace()
# nlp = English()
# TODO: ImportError: No module named _doc_examples
doc = nlp(unprocessed_unicode)
def test_main_entry_point(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp('Some text.') # Applies tagger, parser, entity
doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser
doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity
doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser
doc = nlp('') # Zero-length tokens, not an error
# doc = nlp(b'Some text') <-- Error: need unicode
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
def test_sentence_spans(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
@pytest.mark.xfail
def test_entity_spans(nlp):
# from spacy.en import English
# nlp = English()
tokens = nlp('Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 112504
assert ents[0].label_ == 'PERSON'
assert ents[0].orth_ == 'Best'
assert ents[0].string == ents[0].string
def test_noun_chunk_spans(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp('The sentence in this example has three noun chunks.')
for chunk in doc.noun_chunks:
print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
# NP The sentence <-- has
# NP this example <-- in
# NP three noun chunks <-- has
@pytest.mark.xfail
def test_count_by(nlp):
# from spacy.en import English, attrs
# nlp = English()
from spacy.en import attrs
tokens = nlp('apple apple orange banana')
assert tokens.count_by(attrs.ORTH) == {12800L: 1,
11880L: 2,
7561L: 1}
assert tokens.to_array([attrs.ORTH]) == array([[11880],
[11880],
[7561],
[12800]])
@pytest.mark.xfail
def test_read_bytes():
# TODO: missing imports
for byte_string in Doc.read_bytes(open('path/to/data_directory')):
doc = Doc(nlp.vocab).from_bytes(byte_string)
def test_token_span(doc):
span = doc[4:6]
token = span[0]
assert token.i == 4
def test_example_i_like_new_york1(nlp):
toks = nlp('I like New York in Autumn.')
@pytest.fixture
def toks(nlp):
return nlp('I like New York in Autumn.')
def test_example_i_like_new_york2(toks):
i, like, new, york, in_, autumn, dot = range(len(toks))
@pytest.fixture
def tok(toks, tok):
i, like, new, york, in_, autumn, dot = range(len(toks))
return locals()[tok]
@pytest.fixture
def new(toks):
return tok(toks, "new")
@pytest.fixture
def york(toks):
return tok(toks, "york")
@pytest.fixture
def autumn(toks):
return tok(toks, "autumn")
@pytest.fixture
def dot(toks):
return tok(toks, "dot")
def test_example_i_like_new_york3(toks, new, york):
assert toks[new].head.orth_ == 'York'
assert toks[york].head.orth_ == 'like'
def test_example_i_like_new_york4(toks, new, york):
new_york = toks[new:york+1]
assert new_york.root.orth_ == 'York'
@pytest.mark.xfail
def test_example_i_like_new_york5(toks, autumn, dot):
assert toks[autumn].head.orth_ == 'in'
assert toks[dot].head.orth_ == 'like'
# TODO: TypeError: readonly attribute
autumn_dot = toks[autumn:]
assert autumn_dot.root.orth_ == 'Autumn'
@pytest.mark.xfail
def test_navigating_the_parse_tree_lefts(doc):
# TODO: where does the span object come from?
lefts = [span.doc[i] for i in range(0, span.start)
if span.doc[i].head in span]
@pytest.mark.xfail
def test_navigating_the_parse_tree_rights(doc):
# TODO: where does the span object come from?
rights = [span.doc[i] for i in range(span.end, len(span.doc))
if span.doc[i].head in span]
def test_string_store(doc):
string_store = doc.vocab.strings
for i, string in enumerate(string_store):
assert i == string_store[string]

View File

@ -2,17 +2,6 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.fixture(scope="session")
def nlp():
from spacy.en import English
return English()
@pytest.fixture()
def doc(nlp):
return nlp('Hello, world. Here are two sentences.')
@pytest.fixture() @pytest.fixture()
def token(doc): def token(doc):
return doc[0] return doc[0]
@ -31,6 +20,7 @@ def test_get_tokens_and_sentences(doc):
assert sentence.text == 'Hello, world.' assert sentence.text == 'Hello, world.'
@pytest.mark.xfail
def test_use_integer_ids_for_any_strings(nlp, token): def test_use_integer_ids_for_any_strings(nlp, token):
hello_id = nlp.vocab.strings['Hello'] hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id] hello_str = nlp.vocab.strings[hello_id]
@ -65,6 +55,7 @@ def test_export_to_numpy_arrays(nlp, doc):
assert list(doc_array[:, 1]) == [t.like_url for t in doc] assert list(doc_array[:, 1]) == [t.like_url for t in doc]
@pytest.mark.xfail
def test_word_vectors(nlp): def test_word_vectors(nlp):
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
@ -76,6 +67,7 @@ def test_word_vectors(nlp):
assert apples.similarity(oranges) > boots.similarity(hippos) assert apples.similarity(oranges) > boots.similarity(hippos)
@pytest.mark.xfail
def test_part_of_speech_tags(nlp): def test_part_of_speech_tags(nlp):
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@ -151,6 +143,7 @@ def test_calculate_inline_mark_up_on_original_string():
return string return string
@pytest.mark.xfail
def test_efficient_binary_serialization(doc): def test_efficient_binary_serialization(doc):
byte_string = doc.as_bytes() byte_string = doc.as_bytes()
open('/tmp/moby_dick.bin', 'wb').write(byte_string) open('/tmp/moby_dick.bin', 'wb').write(byte_string)