mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 18:07:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			54 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			54 lines
		
	
	
		
			1.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
import srsly
 | 
						|
 | 
						|
from ...gold import docs_to_json
 | 
						|
from ...util import get_lang_class, minibatch
 | 
						|
 | 
						|
 | 
						|
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
    """Convert JSONL-formatted NER annotations into a list of JSON documents.

    Every non-blank line of `input_data` is parsed as one JSON record. A
    record must provide a "text" value and a list of annotations under
    "entities" or, when that key is absent, under "spans". Each annotation
    must provide "start", "end" and "label"; start and end are handed to
    `doc.char_span` as character offsets into the text.

    input_data: The JSONL content as a single string, one record per line.
    lang: Language identifier passed to `get_lang_class`. Required.
    n_sents: Number of records grouped into each output document.
    use_morphology: Not used by this converter.
    **_: Any further keyword arguments are ignored.
    RETURNS: A list with one `docs_to_json` result per batch of records.
    RAISES: ValueError if `lang` is None.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    # Skip blank lines so that empty input, or a stray empty line between
    # records, is ignored instead of being handed to the JSON parser and
    # aborting the whole conversion.
    input_examples = [
        srsly.json_loads(line)
        for line in input_data.strip().split("\n")
        if line.strip()
    ]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    json_docs = []
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            # Annotations live under "entities", with "spans" as the fallback.
            annotations = record["entities"] if "entities" in record else record["spans"]
            offsets = [(a["start"], a["end"], a["label"]) for a in annotations]
            doc = nlp.make_doc(raw_text)
            # Applied for its effect on the doc; the return value is not needed.
            sentencizer(doc)
            spans = [
                doc.char_span(start, end, label=label)
                for start, end, label in offsets
            ]
            # _cleanup_spans drops None entries, trims whitespace tokens and
            # removes spans that overlap an earlier one.
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
 | 
						|
 | 
						|
 | 
						|
def _cleanup_spans(spans):
 | 
						|
    output = []
 | 
						|
    seen = set()
 | 
						|
    for span in spans:
 | 
						|
        if span is not None:
 | 
						|
            # Trim whitespace
 | 
						|
            while len(span) and span[0].is_space:
 | 
						|
                span = span[1:]
 | 
						|
            while len(span) and span[-1].is_space:
 | 
						|
                span = span[:-1]
 | 
						|
            if not len(span):
 | 
						|
                continue
 | 
						|
            for i in range(span.start, span.end):
 | 
						|
                if i in seen:
 | 
						|
                    break
 | 
						|
            else:
 | 
						|
                output.append(span)
 | 
						|
                seen.update(range(span.start, span.end))
 | 
						|
    return output
 |