spaCy/tests/test_onto_sgml_extract.py

32 lines
1.1 KiB
Python
Raw Normal View History

import pytest
import os
from os import path
from spacy.munge.read_ontonotes import sgml_extract
# Contents of the sample SGML document 'web_sample1.sgm', which is looked up
# in the same directory as this test module. Read through a context manager
# so the file handle is closed as soon as the text has been loaded.
with open(path.join(path.dirname(__file__), 'web_sample1.sgm')) as _sample_file:
    text_data = _sample_file.read()
def test_example_extract():
    """Check the fields sgml_extract returns for the bundled sample document.

    The input is the module-level ``text_data`` string. The result is indexed
    by field name, and each field is compared against the value expected for
    that one sample.
    """
    extracted = sgml_extract(text_data)

    # Document-level metadata.
    assert extracted['docid'] == (
        'blogspot.com_alaindewitt_20060924104100_ENG_20060924_104100'
    )
    assert extracted['doctype'] == 'BLOG TEXT'
    assert extracted['datetime'] == '2006-09-24T10:41:00'

    # Post-level metadata; only the headline is compared after stripping
    # surrounding whitespace.
    headline = extracted['headline'].strip()
    assert headline == 'Devastating Critique of the Arab World by One of Its Own'
    assert extracted['poster'] == 'Alain DeWitt'
    assert extracted['postdate'] == '2006-09-24T10:41:00'

    # Body text: check both ends and that no '<' character remains in it.
    # On failure the assertion message shows a 10-character edge of the text.
    body = extracted['text']
    assert body.startswith('Thanks again to my fri'), body[:10]
    assert body.endswith(' tide will turn."'), body[-10:]
    assert '<' not in body, body[:10]
def test_directory():
    """Run sgml_extract over every file in a local OntoNotes 5 directory.

    This is a smoke test: it makes no assertions about the extracted result
    and passes as long as ``sgml_extract`` does not raise for any file.

    The directory is a hard-coded absolute path that may not exist on the
    machine running the tests, so the test is skipped (rather than erroring
    out of ``os.listdir``) when the directory is missing.
    """
    context_dir = '/usr/local/data/OntoNotes5/data/english/metadata/context/wb/sel'
    if not path.isdir(context_dir):
        pytest.skip('OntoNotes context directory not found: ' + context_dir)
    # Sorted so files are visited in the same order on every run, which makes
    # a failure on a particular file reproducible.
    for fn in sorted(os.listdir(context_dir)):
        with open(path.join(context_dir, fn)) as file_:
            text = file_.read()
        # Only the absence of an exception matters; the result is discarded.
        sgml_extract(text)