Merge pull request #111 from henningpeters/master

add doctests for website 'api'-section
Matthew Honnibal 2015-09-28 22:40:48 +10:00
commit 37729b9592
6 changed files with 225 additions and 138 deletions

tests/website/conftest.py (new file, 13 additions)

@@ -0,0 +1,13 @@
from __future__ import unicode_literals
import pytest
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English
return English()
@pytest.fixture()
def doc(nlp):
return nlp('Hello, world. Here are two sentences.')

tests/website/test_api.py (new file, 163 additions)

@@ -0,0 +1,163 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
def test_example_war_and_peace(nlp):
# from spacy.en import English
from spacy._doc_examples import download_war_and_peace
unprocessed_unicode = download_war_and_peace()
# nlp = English()
# TODO: ImportError: No module named _doc_examples
doc = nlp(unprocessed_unicode)
def test_main_entry_point(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp('Some text.') # Applies tagger, parser, entity
doc = nlp('Some text.', parse=False) # Applies tagger and entity, not parser
doc = nlp('Some text.', entity=False) # Applies tagger and parser, not entity
doc = nlp('Some text.', tag=False) # Does not apply tagger, entity or parser
doc = nlp('') # Zero-length tokens, not an error
# doc = nlp(b'Some text') <-- Error: need unicode
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
def test_sentence_spans(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
@pytest.mark.xfail
def test_entity_spans(nlp):
# from spacy.en import English
# nlp = English()
tokens = nlp('Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 112504
assert ents[0].label_ == 'PERSON'
assert ents[0].orth_ == 'Best'
assert ents[0].string == ents[0].string
def test_noun_chunk_spans(nlp):
# from spacy.en import English
# nlp = English()
doc = nlp('The sentence in this example has three noun chunks.')
for chunk in doc.noun_chunks:
print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
# NP The sentence <-- has
# NP this example <-- in
# NP three noun chunks <-- has
@pytest.mark.xfail
def test_count_by(nlp):
# from spacy.en import English, attrs
# nlp = English()
from spacy.en import attrs
tokens = nlp('apple apple orange banana')
assert tokens.count_by(attrs.ORTH) == {12800L: 1,
11880L: 2,
7561L: 1}
assert tokens.to_array([attrs.ORTH]) == array([[11880],
[11880],
[7561],
[12800]])
@pytest.mark.xfail
def test_read_bytes():
# TODO: missing imports
for byte_string in Doc.read_bytes(open('path/to/data_directory')):
doc = Doc(nlp.vocab).from_bytes(byte_string)
def test_token_span(doc):
span = doc[4:6]
token = span[0]
assert token.i == 4
def test_example_i_like_new_york1(nlp):
toks = nlp('I like New York in Autumn.')
@pytest.fixture
def toks(nlp):
return nlp('I like New York in Autumn.')
def test_example_i_like_new_york2(toks):
i, like, new, york, in_, autumn, dot = range(len(toks))
@pytest.fixture
def tok(toks, tok):
i, like, new, york, in_, autumn, dot = range(len(toks))
return locals()[tok]
@pytest.fixture
def new(toks):
return tok(toks, "new")
@pytest.fixture
def york(toks):
return tok(toks, "york")
@pytest.fixture
def autumn(toks):
return tok(toks, "autumn")
@pytest.fixture
def dot(toks):
return tok(toks, "dot")
def test_example_i_like_new_york3(toks, new, york):
assert toks[new].head.orth_ == 'York'
assert toks[york].head.orth_ == 'like'
def test_example_i_like_new_york4(toks, new, york):
new_york = toks[new:york+1]
assert new_york.root.orth_ == 'York'
@pytest.mark.xfail
def test_example_i_like_new_york5(toks, autumn, dot):
assert toks[autumn].head.orth_ == 'in'
assert toks[dot].head.orth_ == 'like'
# TODO: TypeError: readonly attribute
autumn_dot = toks[autumn:]
assert autumn_dot.root.orth_ == 'Autumn'
@pytest.mark.xfail
def test_navigating_the_parse_tree_lefts(doc):
# TODO: where does the span object come from?
lefts = [span.doc[i] for i in range(0, span.start)
if span.doc[i].head in span]
@pytest.mark.xfail
def test_navigating_the_parse_tree_rights(doc):
# TODO: where does the span object come from?
rights = [span.doc[i] for i in range(span.end, len(span.doc))
if span.doc[i].head in span]
def test_string_store(doc):
string_store = doc.vocab.strings
for i, string in enumerate(string_store):
assert i == string_store[string]


@@ -3,17 +3,6 @@ import pytest
import spacy.en
@pytest.fixture(scope="session")
def nlp():
from spacy.en import English
return English()
@pytest.fixture()
def doc(nlp):
return nlp('Hello, world. Here are two sentences.')
@pytest.fixture()
def token(doc):
return doc[0]


@@ -1,8 +1,8 @@
all: src/code site
src/code: tests/test_*.py
src/code:
mkdir -p src/code/
./create_code_samples tests/ src/code/
./create_code_samples ../tests/website/ src/code/
site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/


@@ -17,7 +17,7 @@ prefix = "test_"
for filename in os.listdir(src_dirname):
match = re.match(re.escape(prefix) + r"(.+)\.py", filename)
match = re.match(re.escape(prefix) + r"(.+)\.py$", filename)
if not match:
continue
@@ -25,27 +25,23 @@ for filename in os.listdir(src_dirname):
source = open(os.path.join(src_dirname, filename)).readlines()
tree = ast.parse("".join(source))
for item in tree.body:
if isinstance(item, ast.FunctionDef) and item.name.startswith(prefix):
for root in tree.body:
if isinstance(root, ast.FunctionDef) and root.name.startswith(prefix):
# only ast.expr and ast.stmt have line numbers, see:
# https://docs.python.org/2/library/ast.html#ast.AST.lineno
line_numbers = []
def fill_line_numbers(node):
for child in ast.iter_child_nodes(node):
if ((isinstance(child, ast.expr) or
isinstance(child, ast.stmt)) and
child.lineno > item.lineno):
for node in ast.walk(root):
if hasattr(node, "lineno"):
line_numbers.append(node.lineno)
line_numbers.append(child.lineno)
fill_line_numbers(child)
fill_line_numbers(item)
body = source[min(line_numbers)-1:max(line_numbers)]
while not body[0][0].isspace():
body = body[1:]
# make sure we are inside an indented function body
assert all([re.match(r"\s", l[0]) for l in body])
assert all([l[0].isspace() for l in body])
offset = 0
for line in body:
@@ -63,7 +59,7 @@ for filename in os.listdir(src_dirname):
# make sure empty lines contain a newline
assert all([l[-1] == "\n" for l in body])
code_filename = "%s.%s" % (name, item.name[len(prefix):])
code_filename = "%s.%s" % (name, root.name[len(prefix):])
with open(os.path.join(dst_dirname, code_filename), "w") as f:
f.write(escape("".join(body)))
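
For readers unfamiliar with the pattern, here is a self-contained sketch (not part of the script, example function invented for illustration) of how ast.walk collects the line numbers that bound a test function's body:

# Illustrative only: mirrors the line-number collection used above.
import ast

source = (
    "def test_example():\n"
    "    x = 1\n"
    "    assert x == 1\n"
)
tree = ast.parse(source)
func = tree.body[0]
line_numbers = [node.lineno for node in ast.walk(func) if hasattr(node, "lineno")]
# min(line_numbers) is the 1-based lineno of the "def" line, so slicing the
# 0-based list of source lines from that index starts at the first body line.
body = source.splitlines(True)[min(line_numbers):max(line_numbers)]
assert body == ["    x = 1\n", "    assert x == 1\n"]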


@@ -76,15 +76,8 @@ mixin summary
block
mixin en_example
pre.language-python
code
| from spacy.en import English
| from spacy._doc_examples import download_war_and_peace
|
| unprocessed_unicode = download_war_and_peace()
|
| nlp = English()
| doc = nlp(unprocessed_unicode)
pre.language-python: code
include ../../code/api.example_war_and_peace
mixin SeeAlso(name, link_target)
a(href=link_target)
@@ -197,19 +190,19 @@ mixin Func(type1, type2)
pre.language-python
code
| >>> nlp = spacy.en.English()
| nlp = spacy.en.English()
p To keep the default components, but load data from a specified directory, use:
pre.language-python
code
| >>> nlp = English(data_dir=u'path/to/data_directory')
| nlp = English(data_dir=u'path/to/data_directory')
p To disable (and avoid loading) parts of the processing pipeline:
pre.language-python
code
| >>> nlp = English(parser=False, tagger=False, entity=False)
| nlp = English(parser=False, tagger=False, entity=False)
+params
+param("data_dir")
@@ -249,17 +242,8 @@ mixin Func(type1, type2)
+param("entity", types.bool)
| Whether to apply the named entity recognizer.
pre.language-python
code
| from spacy.en import English
| nlp = English()
| doc = nlp(u'Some text.') # Applies tagger, parser, entity
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
| doc = nlp(u'') # Zero-length tokens, not an error
| # doc = nlp(b'Some text') <-- Error: need unicode
| doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
pre.language-python: code
include ../../code/api.main_entry_point
+declare_class("Doc", "doc")
@@ -297,41 +281,19 @@ mixin Func(type1, type2)
+attribute("sents", types.generator)(open=true)
| Yields sentence #[code Span] objects. Iterate over the span to get individual #[code Token] objects. Sentence spans have no label.
pre.language-python
code
| >>> from spacy.en import English
| >>> nlp = English()
| >>> doc = nlp(u"This is a sentence. Here's another...")
| >>> for sentence in doc.sents:
| ... sentence.root.orth_
| is
| 's
pre.language-python: code
include ../../code/api.sentence_spans
+attribute("ents", types.generator)(open=true)
| Yields named-entity #[code Span] objects. Iterate over the span to get individual #[code Token] objects, or access the label:
pre.language-python
code
| >>> from spacy.en import English
| >>> nlp = English()
| >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
| >>> ents = list(tokens.ents)
| >>> ents[0].label, ents[0].label_, ents[0].orth_, ents[0].string
| (112504, 'PERSON', 'Best', ents[0].string)
pre.language-python: code
include ../../code/api.entity_spans
+attribute("noun_chunks", types.generator)(open=true)
| Yields base noun-phrase #[code Span ] objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it &ndash; so no NP-level coordination, no prepositional phrases, and no relative clauses. For example:
pre.language-python
code
| >>> from spacy.en import English
| >>> nlp = English()
| >>> doc = nlp('The sentence in this example has three noun chunks.')
| >>> for chunk in doc.noun_chunks:
| ... print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
| NP The sentence <-- has
| NP this example <-- in
| NP three noun chunks <-- has
pre.language-python: code
include ../../code/api.noun_chunk_spans
details
summary: h4 Export/Import
@@ -346,18 +308,8 @@ mixin Func(type1, type2)
+method("count_by", "attr_id")(open=true)
| Produce a dict of #[code {attribute (int): count (ints)}] frequencies, keyed by the values of the given attribute ID.
pre.language-python
code
| >>> from spacy.en import English, attrs
| >>> nlp = English()
| >>> tokens = nlp(u'apple apple orange banana')
| >>> tokens.count_by(attrs.ORTH)
| {12800L: 1, 11880L: 2, 7561L: 1}
| >>> tokens.to_array([attrs.ORTH])
| array([[11880],
| [11880],
| [7561],
| [12800]])
pre.language-python: code
include ../../code/api.count_by
+method("from_array", "attrs, array")(open=true)
| Write to a #[code Doc] object, from an M*N array of attributes.
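
No doctest accompanies from_array in this commit; a hypothetical sketch of the call shape implied by the documented signature, in the style of test_api.py above (whether every attribute, including ORTH, is writable this way is an assumption):

def test_from_array(nlp):
    # Hypothetical, not in this commit: round-trip one attribute column
    # through to_array/from_array.
    from spacy.en import attrs
    doc = nlp('apple apple orange banana')
    orth_array = doc.to_array([attrs.ORTH])   # M*N array, here N == 1
    doc.from_array([attrs.ORTH], orth_array)  # write the same values back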
@@ -371,10 +323,8 @@ mixin Func(type1, type2)
+method("read_bytes")(open=true)
| A staticmethod, used to read serialized #[code Doc] objects from a file.
| For example:
pre.language-python
code
| for byte_string in Doc.read_bytes(open(location_of_bytes)):
| doc = Doc(nlp.vocab).from_bytes(byte_string)
pre.language-python: code
include ../../code/api.read_bytes
+declare_class("Token", "token")
p A Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. #[code token.orth] is an integer ID, #[code token.orth_] is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed.
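
The Token paragraph has no doctest in this commit; a hypothetical one in the style of test_api.py, illustrating the integer-ID/underscore convention it describes:

def test_token_orth(nlp):
    # Hypothetical, not in this commit: token.orth is an integer ID,
    # token.orth_ is the corresponding unicode string.
    doc = nlp('Hello, world. Here are two sentences.')
    token = doc[0]
    assert token.orth_ == 'Hello'
    assert doc.vocab.strings[token.orth] == 'Hello'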
@@ -476,11 +426,8 @@ mixin Func(type1, type2)
+Define("token = span[i]")
| Get the #[code Token] object at position #[em i], where #[em i] is an offset within the #[code Span], not the document. That is:
pre.language-python
code
| span = doc[4:6]
| token = span[0]
| assert token.i == 4
pre.language-python: code
include ../../code/api.token_span
ul
+Define("for token in span")
@@ -503,53 +450,34 @@ mixin Func(type1, type2)
+attribute("root")(open=true)
| The first ancestor of the first word of the span that has its head outside the span. For example:
pre.language-python
code
| >>> toks = nlp(u'I like New York in Autumn.')
pre.language-python: code
include ../../code/api.example_i_like_new_york1
p Let's name the indices --- easier than writing #[code toks[4]] etc.
pre.language-python
code
| >>> i, like, new, york, in_, autumn, dot = range(len(toks))
pre.language-python: code
include ../../code/api.example_i_like_new_york2
p The head of #[em new] is #[em York], and the head of #[em York] is #[em like]
pre.language-python
code
| >>> toks[new].head.orth_
| 'York'
| >>> toks[york].head.orth_
| 'like'
pre.language-python: code
include ../../code/api.example_i_like_new_york3
p Create a span for "New York". Its root is "York".
pre.language-python
code
| >>> new_york = toks[new:york+1]
| >>> new_york.root.orth_
| 'York'
pre.language-python: code
include ../../code/api.example_i_like_new_york4
p When there are multiple words with external dependencies, we take the first:
pre.language-python
code
| >>> toks[autumn].head.orth_, toks[dot].head.orth_
| ('in', 'like')
| >>> autumn_dot = toks[autumn:]
| >>> autumn_dot.root.orth_
| 'Autumn'
pre.language-python: code
include ../../code/api.example_i_like_new_york5
+attribute("lefts")(open=true)
| Tokens that are to the left of the span, whose head is within the span, i.e.
code.language-python
| lefts = [span.doc[i] for i in range(0, span.start)
| if span.doc[i].head in span]
pre.language-python: code
include ../../code/api.navigating_the_parse_tree_lefts
+attribute("rights")(open=true)
| Tokens that are to the right of the span, whose head is within the span, i.e.
code.language-python
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
| if span.doc[i].head in span]
pre.language-python: code
include ../../code/api.navigating_the_parse_tree_rights
+attribute("subtree")(open=true)
| Tokens in the range #[code (start, end+1)], where #[code start] is the index of the leftmost word descended from a token in the span, and #[code end] is the index of the rightmost token descended from a token in the span.
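
No doctest covers subtree in this commit; a hypothetical sketch, reusing the "I like New York in Autumn." example and assuming the parse shown earlier, that checks the contiguity property described above:

def test_span_subtree(nlp):
    # Hypothetical, not in this commit: span.subtree covers a contiguous
    # range of token indices, per the description above.
    toks = nlp('I like New York in Autumn.')
    new_york = toks[2:4]                         # the Span "New York"
    indices = [w.i for w in new_york.subtree]
    assert indices == list(range(min(indices), max(indices) + 1))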
@@ -669,10 +597,8 @@ mixin Func(type1, type2)
+Define("for string in string_store")(open=true)
| Iterate over strings in the string store, in order, such that the #[em i]th string in the sequence has the ID #[em i]:
pre.language-python
code
| for i, string in enumerate(string_store):
| assert i == string_store[string]
pre.language-python: code
include ../../code/api.string_store
+init
p #[code StringStore.__init__] takes no arguments, so a new instance can be constructed as follows:
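
The constructor example that follows this paragraph falls outside the diff context; a minimal sketch, assuming StringStore is importable from spacy.strings as in spaCy 0.x:

# Minimal sketch (assumption: StringStore lives in spacy.strings).
from spacy.strings import StringStore

string_store = StringStore()
hello_id = string_store[u'Hello']           # interning a string returns its ID
assert string_store[hello_id] == u'Hello'   # the ID maps back to the string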