Hi There,
I'm trying to create a pipeline component which identifies terms that describe an "outgroup". In the following sentence, for example, an "outgroup" would be "Taliban Regime". The outgroup in this instance is based on a named entity (Taliban) and a named concept (Regime).
On my orders, the United States military has begun strikes against Al Qaeda terrorist training camps and military installations of the Taliban regime in Afghanistan.
The pipeline has a component called concept recognition, which uses a markup schema to annotate tokens and spans with a custom attribute and creates a doc extension called "named_concepts".
I am now trying to create a final pipeline component for identifying outgroups based on a pattern combining "ENT_TYPE" and the custom attribute. All the pipeline components have been tested and are working as expected; however, I can't seem to get the pattern matching to work.
The problem seems to be in writing the correct pattern. Would you be able to let me know where I'm going wrong, please?
Code below:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Doc
import pipeline # module for custom pipeline components
# Load the small English model and strip it down to the three stock
# components this script builds on.
nlp = spacy.load("en_core_web_sm")
for component in nlp.pipe_names:
    if component not in ["tagger", "parser", "ner"]:
        nlp.remove_pipe(component)

# add named entity matcher component to pipeline
# (runs before "ner" to top up on named entities)
nlp.add_pipe(pipeline.EntityMatcher(nlp), before="ner")

# add merge entities
merge_ents = nlp.create_pipe("merge_entities")
nlp.add_pipe(merge_ents, after="ner")

# add concept matcher component to pipeline
# (runs after "merge_entities" to add concepts)
nlp.add_pipe(pipeline.ConceptMatcher(nlp), after="merge_entities")

print(nlp.pipe_names)
# ['tagger', 'parser', 'Named Entity Matcher', 'ner', 'merge_entities', 'Concept Matcher']
class group_id(object):
    """Pipeline component that stores matcher hits on ``doc._.outgroup_entities``.

    A ``Matcher`` is built once in ``__init__``; every call runs it over the
    incoming doc and appends each matched ``Span`` to the doc-level
    ``outgroup_entities`` extension.
    """

    # Name under which the component is registered in the pipeline.
    name = "group id"
    # Entity labels accepted by the ENT_TYPE pattern below.
    GROUP = ["NORP", "GPE", "ORG", "PERSON"]

    def __init__(self, nlp):
        """Register the doc extension and build the "OUTGROUP" matcher.

        Args:
            nlp: the pipeline object; its ``vocab`` backs the matcher and is
                used in ``__call__`` to turn match ids back into strings.
        """
        self.nlp = nlp
        # force=True lets the class be re-instantiated without an
        # "extension already exists" error.
        Doc.set_extension("outgroup_entities", default=[], force=True)
        self.outgroups = Matcher(nlp.vocab)
        self.outgroups.add(
            "OUTGROUP",
            None,
            # this pattern works
            [{"ENT_TYPE": {"IN": group_id.GROUP}}],
        )
        # none of these patterns work
        # [{'ENT_TYPE': {"IN" : group_id.GROUP}}, {"_" : {"ATTRIBUTE" : {"IN" : ["outgroup"]}}}])
        # [{'ENT_TYPE': {"IN" : group_id.GROUP}}, {"_" : {"ATTRIBUTE" : "outgroup"}}])
        # [{"_" : {"ATTRIBUTE" : "outgroup"}}])
        # [{"_" : {"ATTRIBUTE" : {"IN" : ["outgroup"]}}}])
        #
        # NOTE(review): Matcher patterns describe individual tokens, and the
        # "_" key reads Token extension attributes. If ATTRIBUTE is only
        # registered/set on Span objects (presumably what the concept matcher
        # does for doc._.named_concepts -- confirm against
        # pipeline.ConceptMatcher), the token-level lookup never sees
        # "outgroup" and the patterns above cannot match. Registering
        # ATTRIBUTE with Token.set_extension and setting it on every token of
        # a concept span should make them usable -- TODO confirm.

    def __call__(self, doc):
        """Run the matcher on ``doc`` and record every match.

        Args:
            doc: the document handed over by the previous pipeline component.

        Returns:
            The same ``doc``, with matched spans appended to
            ``doc._.outgroup_entities``.
        """
        # prints correct output confirming named entities extension is working
        # named entities: [the United States, Al Qaeda, Taliban, Afghanistan]
        print("named entities: ", list(doc.ents))
        # prints correct output confirming named concepts extension is working
        # outgroup concepts: [terrorist, regime]
        print(
            "outgroup concepts: ",
            [
                concept
                for concept in doc._.named_concepts
                if concept._.ATTRIBUTE == "outgroup"
            ],
        )
        # The matcher only reads the doc, so no retokenizer context is needed.
        matches = self.outgroups(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end)
            # returns results for the "ENT_TYPE" pattern but not for patterns
            # trying to access custom attribute
            print(self.nlp.vocab.strings[match_id], start, end, span.text)
            # Re-assign a fresh list rather than mutating in place, so the
            # extension's default list is never touched.
            doc._.outgroup_entities = list(doc._.outgroup_entities) + [span]
        return doc
# Remove an already-registered "group id" component before adding a new one
# (presumably so the snippet can be re-run in the same session).
if "group id" in nlp.pipe_names:
    nlp.remove_pipe("group id")
nlp.add_pipe(group_id(nlp), last=True)

# Adjacent literals are concatenated into one single-line sentence; a plain
# quoted string cannot span two source lines.
text = (
    "On my orders, the United States military has begun strikes against "
    "Al Qaeda terrorist training camps and military installations of the "
    "Taliban regime in Afghanistan."
)
output = nlp(text)
print(output._.outgroup_entities)
# ENT_TYPE pattern only returns: [the United States, Al Qaeda, Taliban, Afghanistan]
# when custom attribute pattern used returns empty list