Mirror of https://github.com/explosion/spaCy.git
	pull merge_sent into iob2docs to avoid Doc creation for each line
parent 5cf3eeee0d
commit 351ab3a3d4
@@ -1,9 +1,9 @@
 from wasabi import Printer
 
+from .conll_ner2docs import n_sents_info
 from ...gold import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
-from .util import merge_sentences
-from .conll_ner2docs import n_sents_info
+from ...util import minibatch
 
 
 def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
@@ -19,31 +19,44 @@ def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
     I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
     """
     msg = Printer(no_print=no_print)
-    docs = read_iob(input_data.split("\n"), vocab)
     if n_sents > 0:
         n_sents_info(msg, n_sents)
-        docs = merge_sentences(docs, n_sents)
+    docs = read_iob(input_data.split("\n"), vocab, n_sents)
     return docs
 
 
-def read_iob(raw_sents, vocab):
+def read_iob(raw_sents, vocab, n_sents):
     docs = []
-    for line in raw_sents:
-        if not line.strip():
-            continue
-        tokens = [t.split("|") for t in line.split()]
-        if len(tokens[0]) == 3:
-            words, tags, iob = zip(*tokens)
-        elif len(tokens[0]) == 2:
-            words, iob = zip(*tokens)
-            tags = ["-"] * len(words)
-        else:
-            raise ValueError(
-                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
-            )
+    for group in minibatch(raw_sents, size=n_sents):
+        tokens = []
+        words = []
+        tags = []
+        iob = []
+        sent_starts = []
+        for line in group:
+            if not line.strip():
+                continue
+            sent_tokens = [t.split("|") for t in line.split()]
+            if len(sent_tokens[0]) == 3:
+                sent_words, sent_tags, sent_iob = zip(*sent_tokens)
+            elif len(sent_tokens[0]) == 2:
+                sent_words, sent_iob = zip(*sent_tokens)
+                sent_tags = ["-"] * len(sent_words)
+            else:
+                raise ValueError(
+                    "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
+                )
+            words.extend(sent_words)
+            tags.extend(sent_tags)
+            iob.extend(sent_iob)
+            tokens.extend(sent_tokens)
+            sent_starts.append(True)
+            sent_starts.extend([False for _ in sent_words[1:]])
         doc = Doc(vocab, words=words)
         for i, tag in enumerate(tags):
             doc[i].tag_ = tag
+        for i, sent_start in enumerate(sent_starts):
+            doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
         doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
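The hunk above replaces per-line Doc construction in the IOB converter with minibatch grouping: n_sents IOB lines are collected into one group, their tokens concatenated, and a single Doc is built per group with explicit sentence starts. A minimal standalone sketch of that grouping step (not part of the patch; the sample lines and n_sents value are made up, and only spacy.util.minibatch is assumed):

from spacy.util import minibatch

# Illustrative input: one IOB-formatted sentence per line (made-up sample data).
lines = [
    "I|O like|O London|B-GPE .|O",
    "She|O visits|O Paris|B-GPE .|O",
    "We|O stay|O home|O .|O",
]

n_sents = 2  # how many sentences end up in each merged Doc
for group in minibatch(lines, size=n_sents):
    words, sent_starts = [], []
    for line in group:
        sent_words = [t.split("|")[0] for t in line.split()]
        words.extend(sent_words)
        # the first token of each sentence opens a new sentence in the merged Doc
        sent_starts.append(True)
        sent_starts.extend([False] * (len(sent_words) - 1))
    print(words, sent_starts)

With size=2, minibatch yields two groups here: the first two sentences merged into one word list, and the remaining sentence on its own.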
@@ -1,8 +0,0 @@
-from spacy.util import minibatch
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        raise NotImplementedError
-    return merged
@@ -6,37 +6,18 @@ from ..tokens import Doc
 from .iob_utils import biluo_tags_from_offsets
 
 
-def merge_sents(sents):
-    m_deps = [[], [], [], [], [], []]
-    m_cats = {}
-    m_brackets = []
-    i = 0
-    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
-        m_deps[0].extend(id_ + i for id_ in ids)
-        m_deps[1].extend(words)
-        m_deps[2].extend(tags)
-        m_deps[3].extend(head + i for head in heads)
-        m_deps[4].extend(labels)
-        m_deps[5].extend(ner)
-        m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
-                          for b in brackets)
-        m_cats.update(cats)
-        i += len(ids)
-    return [(m_deps, (m_cats, m_brackets))]
-
-
-def docs_to_json(docs, id=0, ner_missing_tag="O"):
+def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
     docs (iterable / Doc): The Doc object(s) to convert.
-    id (int): Id for the JSON.
+    doc_id (int): Id for the JSON.
     RETURNS (dict): The data in spaCy's JSON format
         - each input doc will be treated as a paragraph in the output doc
     """
     if isinstance(docs, Doc):
         docs = [docs]
-    json_doc = {"id": id, "paragraphs": []}
+    json_doc = {"id": doc_id, "paragraphs": []}
     for i, doc in enumerate(docs):
         json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
         for cat, val in doc.cats.items():
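Besides dropping the now-unused merge_sents helper, this hunk renames the id keyword to doc_id so the parameter no longer shadows Python's built-in id inside the function. A rough sketch of a call against the patched signature (a sketch only: it assumes the helper stays importable as spacy.gold.docs_to_json on this branch, and the sample Doc and doc_id value are purely illustrative):

from spacy.gold import docs_to_json
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["I", "like", "London", "."])
# mark sentence boundaries so the converter can iterate over doc.sents
doc[0].is_sent_start = True
for token in doc[1:]:
    token.is_sent_start = False

json_doc = docs_to_json(doc, doc_id=3)  # keyword was `id=3` before this commit
assert json_doc["id"] == 3
assert len(json_doc["paragraphs"]) == 1  # each input Doc becomes one paragraph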
@@ -117,7 +117,6 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -127,19 +126,19 @@ def test_cli_converters_iob2json(en_vocab):
     ]
     input_data = "\n".join(lines)
     converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
+    assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
-    assert len(converted) == 1
-    assert converted[0]["id"] == 0
-    assert len(converted[0]["paragraphs"]) == 1
-    assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
+    assert converted["id"] == 0
+    assert len(converted["paragraphs"]) == 1
+    assert len(converted["paragraphs"][0]["sentences"]) == 4
     for i in range(0, 4):
-        sent = converted[0]["paragraphs"][0]["sentences"][i]
+        sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
-        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
         # fmt: on
+    assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 
 
 @pytest.mark.xfail
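For reference, the dropped per-token NER assertion matches exactly what iob_to_biluo produces for the first test line; tags_to_entities then turns those BILUO tags into (label, start, end) token spans, which is how the converter above sets doc.ents. A quick sketch, assuming both helpers are importable from spacy.gold as in released spaCy 2.x (matching the relative import in the patch):

from spacy.gold import iob_to_biluo, tags_to_entities

# IOB tags for "I like London and New York City ." (first line of the test input)
iob = ["O", "O", "I-GPE", "O", "B-GPE", "I-GPE", "I-GPE", "O"]
biluo = iob_to_biluo(iob)
print(biluo)
# ['O', 'O', 'U-GPE', 'O', 'B-GPE', 'I-GPE', 'L-GPE', 'O']  -- the values the old assert expected
print(tags_to_entities(biluo))
# [('GPE', 2, 2), ('GPE', 4, 6)]  -- (label, start token, end token), end inclusive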