mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			175 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			175 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| from wasabi import Printer
 | |
| 
 | |
| from ...gold import iob_to_biluo
 | |
| from ...lang.xx import MultiLanguage
 | |
| from ...tokens.doc import Doc
 | |
| from ...util import load_model
 | |
| 
 | |
| 
 | |
| def conll_ner2json(
 | |
|     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 | |
| ):
 | |
|     """
 | |
|     Convert files in the CoNLL-2003 NER format and similar
 | |
|     whitespace-separated columns into JSON format for use with train cli.
 | |
| 
 | |
|     The first column is the tokens, the final column is the IOB tags. If an
 | |
|     additional second column is present, the second column is the tags.
 | |
| 
 | |
|     Sentences are separated with whitespace and documents can be separated
 | |
|     using the line "-DOCSTART- -X- O O".
 | |
| 
 | |
|     Sample format:
 | |
| 
 | |
|     -DOCSTART- -X- O O
 | |
| 
 | |
|     I O
 | |
|     like O
 | |
|     London B-GPE
 | |
|     and O
 | |
|     New B-GPE
 | |
|     York I-GPE
 | |
|     City I-GPE
 | |
|     . O
 | |
| 
 | |
|     """
 | |
|     msg = Printer(no_print=no_print)
 | |
|     doc_delimiter = "-DOCSTART- -X- O O"
 | |
|     # check for existing delimiters, which should be preserved
 | |
|     if "\n\n" in input_data and seg_sents:
 | |
|         msg.warn(
 | |
|             "Sentence boundaries found, automatic sentence segmentation with "
 | |
|             "`-s` disabled."
 | |
|         )
 | |
|         seg_sents = False
 | |
|     if doc_delimiter in input_data and n_sents:
 | |
|         msg.warn(
 | |
|             "Document delimiters found, automatic document segmentation with "
 | |
|             "`-n` disabled."
 | |
|         )
 | |
|         n_sents = 0
 | |
|     # do document segmentation with existing sentences
 | |
|     if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
 | |
|         n_sents_info(msg, n_sents)
 | |
|         input_data = segment_docs(input_data, n_sents, doc_delimiter)
 | |
|     # do sentence segmentation with existing documents
 | |
|     if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
 | |
|         input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
 | |
|     # do both sentence segmentation and document segmentation according
 | |
|     # to options
 | |
|     if "\n\n" not in input_data and doc_delimiter not in input_data:
 | |
|         # sentence segmentation required for document segmentation
 | |
|         if n_sents > 0 and not seg_sents:
 | |
|             msg.warn(
 | |
|                 "No sentence boundaries found to use with option `-n {}`. "
 | |
|                 "Use `-s` to automatically segment sentences or `-n 0` "
 | |
|                 "to disable.".format(n_sents)
 | |
|             )
 | |
|         else:
 | |
|             n_sents_info(msg, n_sents)
 | |
|             input_data = segment_sents_and_docs(
 | |
|                 input_data, n_sents, doc_delimiter, model=model, msg=msg
 | |
|             )
 | |
|     # provide warnings for problematic data
 | |
|     if "\n\n" not in input_data:
 | |
|         msg.warn(
 | |
|             "No sentence boundaries found. Use `-s` to automatically segment "
 | |
|             "sentences."
 | |
|         )
 | |
|     if doc_delimiter not in input_data:
 | |
|         msg.warn(
 | |
|             "No document delimiters found. Use `-n` to automatically group "
 | |
|             "sentences into documents."
 | |
|         )
 | |
|     output_docs = []
 | |
|     for doc in input_data.strip().split(doc_delimiter):
 | |
|         doc = doc.strip()
 | |
|         if not doc:
 | |
|             continue
 | |
|         output_doc = []
 | |
|         for sent in doc.split("\n\n"):
 | |
|             sent = sent.strip()
 | |
|             if not sent:
 | |
|                 continue
 | |
|             lines = [line.strip() for line in sent.split("\n") if line.strip()]
 | |
|             cols = list(zip(*[line.split() for line in lines]))
 | |
|             if len(cols) < 2:
 | |
|                 raise ValueError(
 | |
|                     "The token-per-line NER file is not formatted correctly. "
 | |
|                     "Try checking whitespace and delimiters. See "
 | |
|                     "https://spacy.io/api/cli#convert"
 | |
|                 )
 | |
|             words = cols[0]
 | |
|             iob_ents = cols[-1]
 | |
|             if len(cols) > 2:
 | |
|                 tags = cols[1]
 | |
|             else:
 | |
|                 tags = ["-"] * len(words)
 | |
|             biluo_ents = iob_to_biluo(iob_ents)
 | |
|             output_doc.append(
 | |
|                 {
 | |
|                     "tokens": [
 | |
|                         {"orth": w, "tag": tag, "ner": ent}
 | |
|                         for (w, tag, ent) in zip(words, tags, biluo_ents)
 | |
|                     ]
 | |
|                 }
 | |
|             )
 | |
|         output_docs.append(
 | |
|             {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
 | |
|         )
 | |
|         output_doc = []
 | |
|     return output_docs
 | |
| 
 | |
| 
 | |
| def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
 | |
|     sentencizer = None
 | |
|     if model:
 | |
|         nlp = load_model(model)
 | |
|         if "parser" in nlp.pipe_names:
 | |
|             msg.info("Segmenting sentences with parser from model '{}'.".format(model))
 | |
|             sentencizer = nlp.get_pipe("parser")
 | |
|     if not sentencizer:
 | |
|         msg.info(
 | |
|             "Segmenting sentences with sentencizer. (Use `-b model` for "
 | |
|             "improved parser-based sentence segmentation.)"
 | |
|         )
 | |
|         nlp = MultiLanguage()
 | |
|         sentencizer = nlp.create_pipe("sentencizer")
 | |
|     lines = doc.strip().split("\n")
 | |
|     words = [line.strip().split()[0] for line in lines]
 | |
|     nlpdoc = Doc(nlp.vocab, words=words)
 | |
|     sentencizer(nlpdoc)
 | |
|     lines_with_segs = []
 | |
|     sent_count = 0
 | |
|     for i, token in enumerate(nlpdoc):
 | |
|         if token.is_sent_start:
 | |
|             if n_sents and sent_count % n_sents == 0:
 | |
|                 lines_with_segs.append(doc_delimiter)
 | |
|             lines_with_segs.append("")
 | |
|             sent_count += 1
 | |
|         lines_with_segs.append(lines[i])
 | |
|     return "\n".join(lines_with_segs)
 | |
| 
 | |
| 
 | |
| def segment_docs(input_data, n_sents, doc_delimiter):
 | |
|     sent_delimiter = "\n\n"
 | |
|     sents = input_data.split(sent_delimiter)
 | |
|     docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
 | |
|     input_data = ""
 | |
|     for doc in docs:
 | |
|         input_data += sent_delimiter + doc_delimiter
 | |
|         input_data += sent_delimiter.join(doc)
 | |
|     return input_data
 | |
| 
 | |
| 
 | |
| def n_sents_info(msg, n_sents):
 | |
|     msg.info("Grouping every {} sentences into a document.".format(n_sents))
 | |
|     if n_sents == 1:
 | |
|         msg.warn(
 | |
|             "To generate better training data, you may want to group "
 | |
|             "sentences into documents with `-n 10`."
 | |
|         )
 |