spaCy/tests/test_onto_sgml_extract.py

import pytest
import os
from os import path

from spacy.munge.read_ontonotes import sgml_extract


# Sample SGML document used as the input for test_example_extract; it is
# expected to sit in the same directory as this test module.  The file is read
# inside a context manager so the handle is closed as soon as the contents are
# loaded, rather than being left open until garbage collection.
with open(path.join(path.dirname(__file__), 'web_sample1.sgm')) as _sample_file:
    text_data = _sample_file.read()


def test_example_extract():
    """Run sgml_extract on the sample document and check each extracted field."""
    doc = sgml_extract(text_data)

    # Document-level metadata.
    assert doc['docid'] == 'blogspot.com_alaindewitt_20060924104100_ENG_20060924_104100'
    assert doc['doctype'] == 'BLOG TEXT'
    assert doc['datetime'] == '2006-09-24T10:41:00'

    # Post-level metadata; the headline is compared after stripping
    # surrounding whitespace.
    headline = doc['headline'].strip()
    assert headline == 'Devastating Critique of the Arab World by One of Its Own'
    assert doc['poster'] == 'Alain DeWitt'
    assert doc['postdate'] == '2006-09-24T10:41:00'

    # Body text: check how it starts and ends, and that no '<' character
    # remains in it.  On failure the message shows a ten-character slice of
    # the text to help locate the problem.
    body = doc['text']
    assert body.startswith('Thanks again to my fri'), body[:10]
    assert body.endswith(' tide will turn."'), body[-10:]
    assert '<' not in body, body[:10]


def test_directory():
    """Smoke-test sgml_extract on every file in a local OntoNotes 5 directory.

    The test passes when sgml_extract returns without raising for each file;
    the extracted result itself is not inspected.  The corpus location is a
    hard-coded, machine-specific path, so the test is skipped (rather than
    erroring out) when that directory is not present.
    """
    context_dir = '/usr/local/data/OntoNotes5/data/english/metadata/context/wb/sel'

    if not path.isdir(context_dir):
        pytest.skip('OntoNotes 5 context directory not found: %s' % context_dir)

    # os.listdir returns entries in arbitrary order; sort them so a failure is
    # hit at the same point on every run.
    for fn in sorted(os.listdir(context_dir)):
        with open(path.join(context_dir, fn)) as file_:
            text = file_.read()
        sgml_extract(text)
* Add tests for ontonotes sgml extraction 2015-05-24 22:52:12 +03:00			`import pytest`
			`import os`
			`from os import path`

			`from spacy.munge.read_ontonotes import sgml_extract`


			`text_data = open(path.join(path.dirname(__file__), 'web_sample1.sgm')).read()`


			`def test_example_extract():`
			`article = sgml_extract(text_data)`
			`assert article['docid'] == 'blogspot.com_alaindewitt_20060924104100_ENG_20060924_104100'`
			`assert article['doctype'] == 'BLOG TEXT'`
			`assert article['datetime'] == '2006-09-24T10:41:00'`
			`assert article['headline'].strip() == 'Devastating Critique of the Arab World by One of Its Own'`
			`assert article['poster'] == 'Alain DeWitt'`
			`assert article['postdate'] == '2006-09-24T10:41:00'`
			`assert article['text'].startswith('Thanks again to my fri'), article['text'][:10]`
			`assert article['text'].endswith(' tide will turn."'), article['text'][-10:]`
			`assert '<' not in article['text'], article['text'][:10]`


			`def test_directory():`
			`context_dir = '/usr/local/data/OntoNotes5/data/english/metadata/context/wb/sel'`

			`for fn in os.listdir(context_dir):`
			`with open(path.join(context_dir, fn)) as file_:`
			`text = file_.read()`
			`article = sgml_extract(text)`