Java Code Examples: edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation
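The snippets below are method-level excerpts, so their import statements are omitted. As a point of reference, here is a minimal sketch of the imports most of them assume (the exact set varies per example; note that the annotation key classes are nested inside edu.stanford.nlp.ling.CoreAnnotations):

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;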
Example 1
private static void usingStanfordPOSTagger() {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos");
    props.setProperty("pos.model", "C:\\Current Books in Progress\\NLP and Java\\Models\\english-caseless-left3words-distsim.tagger");
    // Use a String value: values stored via put(Object, Object) as non-Strings
    // are invisible to Properties.getProperty().
    props.setProperty("pos.maxlen", "10");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // theSentence is a String defined elsewhere in the enclosing program.
    Annotation document = new Annotation(theSentence);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.print(word + "/" + pos + " ");
        }
        System.out.println();
        try {
            pipeline.xmlPrint(document, System.out);
            pipeline.prettyPrint(document, System.out);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
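Note that the pos.model path is machine-specific; if pos.model is omitted, CoreNLP falls back to the default tagger model bundled in its models jar. The pos.maxlen property limits the length of sentences the tagger will attempt to tag.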
Example 2
public boolean isCEE(String text) {
    text = text.replace("/", " / ");
    Annotation annotation = new Annotation(text);
    // pipeline is a StanfordCoreNLP field of the enclosing class.
    pipeline.annotate(annotation);
    List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
    boolean flag = false;
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            if (word.equals("and") || word.equals(",") || word.equals("/") || word.equals("or")) {
                flag = true;
                break;
            }
        }
        if (flag) {
            break;
        }
    }
    return flag;
}
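This method flags text that contains a coordination marker ("and", ",", "/", "or") among its tokens. A hypothetical usage, with illustrative inputs:

isCEE("wind and solar power");  // true: contains "and"
isCEE("solar power");           // false: no coordination marker

Padding "/" with spaces before annotating ensures the tokenizer emits it as a separate token; the POS tag is retrieved but does not influence the decision.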
Example 3
@Override
public boolean incrementToken() {
    clearAttributes();
    while (tokens == null || !tokens.hasNext()) {
        if (!getNextSentence()) {
            return false;
        }
    }
    CoreLabel token = tokens.next();
    // Use the lemmatized word:
    String word = token.get(LemmaAnnotation.class);
    if (word == null) { // Fallback when no lemmatization happens.
        word = token.get(TextAnnotation.class);
    }
    termAttribute.setLength(0);
    termAttribute.append(word);
    // NER or part of speech annotation
    String pos = token.get(NamedEntityTagAnnotation.class);
    pos = (pos == null || "O".equals(pos)) ? token.get(PartOfSpeechAnnotation.class) : pos;
    typeAttribute.setType(pos != null ? pos : TypeAttribute.DEFAULT_TYPE);
    // Token character offsets
    int be = token.get(CharacterOffsetBeginAnnotation.class).intValue();
    int en = token.get(CharacterOffsetEndAnnotation.class).intValue();
    offsetAttribute.setOffset(be, en);
    // Token in-document position increment:
    positionAttribute.setPositionIncrement(1 + skippedTokens);
    skippedTokens = 0;
    return true;
}
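This snippet implements Lucene's TokenStream.incrementToken() contract: termAttribute, typeAttribute, offsetAttribute, and positionAttribute are Lucene token attributes (CharTermAttribute, TypeAttribute, OffsetAttribute, and PositionIncrementAttribute) declared in the enclosing class, while getNextSentence(), tokens, and skippedTokens are its own CoreNLP-backed state. The net effect is an analyzer that indexes lemmas and types each term with its NER label or, failing that, its POS tag.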
Example 4
private static void usingStanfordPipelineParallel() {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    String path = "C:\\Current Books\\NLP and Java\\Downloads\\stanford-ner-2014-10-26\\classifiers";
    props.setProperty("ner.model", path + "/english.muc.7class.distsim.crf.ser.gz");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation annotation1 = new Annotation("The robber took the cash and ran.");
    Annotation annotation2 = new Annotation("The policeman chased him down the street.");
    Annotation annotation3 = new Annotation("A passerby, watching the action, tripped the thief as he passed by.");
    Annotation annotation4 = new Annotation("They all lived happily everafter, except for the thief of course.");
    ArrayList<Annotation> list = new ArrayList<>();
    list.add(annotation1);
    list.add(annotation2);
    list.add(annotation3);
    list.add(annotation4);
    Iterable<Annotation> iterable = list;
    pipeline.annotate(iterable);
    System.out.println("Total time: " + pipeline.timingInformation());
    List<CoreMap> sentences = annotation2.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            System.out.println("Word: " + word + " POS Tag: " + pos);
        }
    }
}
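StanfordCoreNLP.annotate(Iterable&lt;Annotation&gt;) annotates every document in the collection; when the pipeline's threads property allows more than one thread, the documents may be processed in parallel, which is what makes this "parallel" variant worthwhile. timingInformation() then reports the accumulated per-annotator timings.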
Example 5
public Word[] getTaggedWords(String sentence) {
    CoreMap taggedSentence = getPOS(sentence);
    Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()];
    int count = 0;
    for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) {
        // this is the text of the token
        String word = token.get(TextAnnotation.class);
        // this is the POS tag of the token
        String pos = token.get(PartOfSpeechAnnotation.class);
        ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count + 1);
        count++;
    }
    return ret;
}
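getPOS(...), getBaseFormOfPattern(...), and the Word type here are helpers of the enclosing class rather than CoreNLP API; the CoreNLP-specific part is reading TokensAnnotation, TextAnnotation, and PartOfSpeechAnnotation off the already-tagged CoreMap.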
Example 6
public static Annotation reconstructStanfordAnnotations(Span sentenceSpan, HashMap<Integer, Word> wordIndex, boolean useWordOrderInsteadOfOffset) {
    String originalText = sentenceSpan.getAnnotation("text", String.class);
    Annotation a = new Annotation(originalText);
    a.set(TextAnnotation.class, originalText);
    List<CoreMap> sentenceAnnotations = new ArrayList<>();
    a.set(SentencesAnnotation.class, sentenceAnnotations);
    List<CoreLabel> tokenAnnotations = new ArrayList<>();
    a.set(TokensAnnotation.class, tokenAnnotations);
    ArrayCoreMap sentenceAnnotation = new ArrayCoreMap();
    sentenceAnnotations.add(sentenceAnnotation);
    for (Word w : sentenceSpan) {
        CoreLabel c = new CoreLabel();
        c.set(TextAnnotation.class, w.getWord());
        c.set(OriginalTextAnnotation.class, w.getWord());
        c.set(ValueAnnotation.class, w.getWord());
        c.set(CharacterOffsetBeginAnnotation.class, w.getStartOffset());
        c.set(CharacterOffsetEndAnnotation.class, w.getEndOffset());
        c.set(IndexAnnotation.class, w.getOrder() + 1);
        c.set(SentenceIndexAnnotation.class, 0);
        c.set(DocIDAnnotation.class, "document");
        c.setDocID("document");
        if (w.hasAnnotation("pos")) {
            c.set(PartOfSpeechAnnotation.class, w.getAnnotation("pos", String.class));
        }
        if (w.hasAnnotation("lemma")) {
            c.set(LemmaAnnotation.class, w.getAnnotation("lemma", String.class));
        }
        if (w.hasAnnotation("nerLabel")) {
            c.set(NamedEntityTagAnnotation.class, w.getAnnotation("nerLabel", String.class));
        }
        if (w.hasAnnotation("nerValue")) {
            c.set(NormalizedNamedEntityTagAnnotation.class, w.getAnnotation("nerValue", String.class));
        }
        tokenAnnotations.add(c);
        if (useWordOrderInsteadOfOffset) {
            wordIndex.put(w.getOrder(), w);
        } else {
            wordIndex.put(w.getStartOffset(), w);
        }
    }
    // essential sentence annotation: TokensAnnotation
    sentenceAnnotation.set(TokensAnnotation.class, tokenAnnotations);
    // essential sentence annotation: TextAnnotation
    sentenceAnnotation.set(TextAnnotation.class, originalText);
    // essential sentence annotation: SentenceIndexAnnotation
    sentenceAnnotation.set(SentenceIndexAnnotation.class, 0);
    sentenceAnnotation.set(CharacterOffsetBeginAnnotation.class, 0);
    sentenceAnnotation.set(CharacterOffsetEndAnnotation.class, sentenceSpan.last().getEndOffset());
    sentenceAnnotation.set(TokenBeginAnnotation.class, 0);
    sentenceAnnotation.set(TokenEndAnnotation.class, sentenceSpan.last().getOrder());
    return a;
}
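This helper builds an Annotation entirely by hand, bypassing CoreNLP's tokenizer and sentence splitter, so that downstream annotators can be run over pre-tokenized text (Example 7 uses exactly this to run a pos-only pipeline). The wordIndex map lets the caller map annotated tokens back to the original Word objects, keyed either by word order or by character offset.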
Example 7
/**
 * Process the Dataset in chunks, as defined by the <code>spanTypeOfSentenceUnit</code> parameter.
 * The Spans denoted by this span type must each contain Words belonging to a single sentence.
 */
@Override
public void validatedProcess(Dataset dataset, String spanTypeOfSentenceUnit) {
    // Check that all prerequisite NLP tasks have been performed on this dataset.
    if (!dataset.getPerformedNLPTasks().containsAll(prerequisites)) {
        HashSet<NLPTask> missingTasks = new HashSet<>();
        missingTasks.addAll(prerequisites);
        missingTasks.removeAll(dataset.getPerformedNLPTasks());
        Framework.error("This dataset does not meet the requirements to use this component! Missing tasks: " + missingTasks);
        return;
    }
    Properties prop1 = new Properties();
    prop1.setProperty("annotators", "pos");
    // false: do not enforce annotator requirements, since tokens are supplied manually.
    StanfordCoreNLP pipeline = new StanfordCoreNLP(prop1, false);
    for (Span span : dataset.getSpans(spanTypeOfSentenceUnit)) {
        HashMap<Integer, Word> wordIndex = new HashMap<>();
        Annotation a = CoreNLPHelper.reconstructStanfordAnnotations(span, wordIndex);
        if (a == null) {
            // Skip spans that could not be reconstructed.
            continue;
        }
        pipeline.annotate(a);
        List<CoreMap> sentenceAnnotations = a.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentenceAnnotations) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                Word w = wordIndex.get(token.get(CharacterOffsetBeginAnnotation.class));
                String tempPos = token.get(PartOfSpeechAnnotation.class);
                // Words annotated with a URI are treated as proper nouns.
                if (w.hasAnnotation("URI")) {
                    w.putAnnotation("pos", "NNP");
                } else {
                    w.putAnnotation("pos", tempPos);
                }
            }
        }
    }
}
Example 8
/** An "annotator" is a Stanford CoreNLP notion. */
void addAnnoToSentenceObject(Map<String, Object> sent_info, CoreMap sentence, String annotator) {
    switch (annotator) {
        case "tokenize":
        case "cleanxml":
        case "ssplit":
            break;
        case "pos":
            addTokenAnno(sent_info, sentence, "pos", PartOfSpeechAnnotation.class);
            break;
        case "lemma":
            addTokenAnno(sent_info, sentence, "lemmas", LemmaAnnotation.class);
            break;
        case "ner":
            addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
            addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
            break;
        case "regexner":
            addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
            break;
        case "sentiment": throw new RuntimeException("TODO");
        case "truecase": throw new RuntimeException("TODO");
        case "parse":
            addParseTree(sent_info, sentence);
            addDepsCC(sent_info, sentence);
            addDepsBasic(sent_info, sentence);
            break;
        case "depparse":
            addDepsCC(sent_info, sentence);
            addDepsBasic(sent_info, sentence);
            break;
        case "dcoref":
            break;
        case "relation": throw new RuntimeException("TODO");
        case "natlog": throw new RuntimeException("TODO");
        case "quote": throw new RuntimeException("TODO");
        case "entitymentions":
            addEntityMentions(sent_info, sentence);
            break;
        default:
            throw new RuntimeException("don't know how to handle annotator " + annotator);
    }
}
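The switch dispatches on the annotator name and copies the corresponding CoreNLP annotations into a JSON-friendly map; addTokenAnno, addParseTree, addDepsCC, addDepsBasic, and addEntityMentions are helper methods of the enclosing class, and the unhandled cases fail fast with a RuntimeException.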
Example 9
/**
 * Process an English text file.
 *
 * @param args command-line arguments (input file, optional input-properties string)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.printf("Usage: java %s file [inputproperties_str] > json_output%n", CoreNLPToJSON.class.getName());
        System.exit(-1);
    }
    String textFile = args[0];
    InputProperties inputProperties = args.length > 1 ? InputProperties.fromString(args[1]) : new InputProperties();
    // 'properties' is a field of the enclosing class holding the pipeline configuration.
    StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
    // Configure tokenizer
    EnglishPreprocessor preprocessor = new EnglishPreprocessor(true);
    // Use a map with ordered keys so that the output is ordered by segmentId.
    Map<Integer, SourceSegment> annotations = new TreeMap<>();
    LineNumberReader reader = IOTools.getReaderFromFile(textFile);
    for (String line; (line = reader.readLine()) != null; ) {
        Annotation annotation = coreNLP.process(line);
        List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
        if (sentences.size() != 1) {
            throw new RuntimeException("Sentence splitting on line: " + String.valueOf(reader.getLineNumber()));
        }
        CoreMap sentence = sentences.get(0);
        Tree tree = sentence.get(TreeAnnotation.class);
        tree.indexLeaves();
        int[] chunkVector = getChunkVector(tree);
        List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
        int numTokens = tokens.size();
        SymmetricalWordAlignment alignment = preprocessor.processAndAlign(line);
        if (alignment.e().size() != numTokens) {
            throw new RuntimeException(String.format("Tokenizer configurations differ: %d/%d", alignment.e().size(), numTokens));
        }
        SourceSegment segment = new SourceSegment(numTokens);
        segment.layoutSpec.addAll(makeLayoutSpec(alignment));
        segment.inputProperties = inputProperties.toString();
        for (int j = 0; j < numTokens; ++j) {
            CoreLabel token = tokens.get(j);
            String word = token.get(TextAnnotation.class);
            segment.tokens.add(unescape(word));
            String pos = mapPOS(token.get(PartOfSpeechAnnotation.class));
            segment.pos.add(pos);
            String ne = token.get(NamedEntityTagAnnotation.class);
            segment.ner.add(ne);
            segment.chunkVector[j] = chunkVector[j];
        }
        annotations.put(reader.getLineNumber() - 1, segment);
    }
    reader.close();
    System.err.printf("Processed %d sentences%n", reader.getLineNumber());
    final SourceDocument jsonDocument = new SourceDocument(textFile, annotations);
    // Convert to JSON
    Gson gson = new Gson();
    String json = gson.toJson(jsonDocument);
    System.out.println(json);
}