Java Code Examples: edu.stanford.nlp.ling.Word
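Before the project examples below, here is a minimal, self-contained sketch of how a Word is typically produced and read; the class name WordDemo and the sample sentence are illustrative only and do not come from any of the examples.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;

public class WordDemo {
    public static void main(String[] args) {
        // Tokenize an arbitrary sample sentence; PTBTokenizer yields Word objects
        // carrying both the surface string and its character offsets.
        PTBTokenizer<Word> tokenizer =
                PTBTokenizer.newPTBTokenizer(new StringReader("Stanford NLP tokenizes text."));
        List<Word> words = tokenizer.tokenize();
        for (Word w : words) {
            System.out.println(w.word() + " [" + w.beginPosition() + ", " + w.endPosition() + ")");
        }
    }
}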
Example 1
@Override
public String[] tokenize(String sentence) {
    Reader r = new StringReader(sentence);
    PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
    List<String> l = new ArrayList<String>();
    while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        l.add(w.word());
    }
    String[] tok = new String[l.size() + 1];
    tok[0] = is2.io.CONLLReader09.ROOT;
    int i = 1;
    for (String s : l)
        tok[i++] = s;
    return tok;
}
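The returned array is one element longer than the token list: slot 0 is reserved for the artificial ROOT token (is2.io.CONLLReader09.ROOT) that the is2 (mate-tools) CoNLL-09 reader expects at the head of every sentence, with the PTB tokens following from index 1.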
Example 2
public StringInText[] tokenizeplus(String sentence) {
    Reader r = new StringReader(sentence);
    PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
    List<StringInText> l = new ArrayList<StringInText>();
    while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        l.add(new StringInText(w.word(), w.beginPosition() + startpos,
                w.endPosition() + startpos));
    }
    StringInText[] tok = new StringInText[l.size() + 1];
    tok[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
    int i = 1;
    for (StringInText s : l)
        tok[i++] = s;
    startpos += (1 + sentence.length());
    return tok;
}
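This variant performs the same tokenization as Example 1 but wraps each token in a StringInText, keeping its begin and end character offsets shifted by the running startpos field, so the offsets refer to the whole input text rather than just the current sentence.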
Example 3
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 *
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");
    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser)
    {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }
    return score;
}
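The block is synchronized because a single shared parser instance is reused across calls; the PCFG score of the resulting parse is then returned as a rough confidence measure for the sentence.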
Example 4
public List<WordLemmaTag> processSentence(String sentence, boolean isTokenized)
{
    final StanfordLemmatizer lemmatizer = StanfordLemmatizer.getInstance();
    final StanfordPOSTagger tagger = StanfordPOSTagger.getInstance();
    final List<WordLemmaTag> tlSentence = new ArrayList<WordLemmaTag>();
    // the tagged sentence
    List<TaggedWord> tSentence = null;
    if (isTokenized)
        tSentence = tagger.tag(sentence);
    else
    {
        StanfordTokenizer tokenizer = StanfordTokenizer.getInstance();
        List<Word> tokens = tokenizer.tokenize(sentence);
        tSentence = tagger.tag(tokens);
    }
    // add to the lemmatized sentence
    for (TaggedWord tw : tSentence)
        tlSentence.add(lemmatizer.lemmatize(tw));
    return tlSentence;
}
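Depending on the isTokenized flag, the sentence is either handed to the tagger as a pre-tokenized string or first split into Word tokens by the StanfordTokenizer wrapper; each resulting TaggedWord is then lemmatized, so every entry of the returned list carries token, lemma, and part-of-speech tag.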
Example 5
public List<Word> tokenize(String string)
{
    this.tokenizer = new PTBTokenizer<Word>(
            new StringReader(string),
            new WordTokenFactory(),
            "untokenizable=noneDelete,ptb3Escaping=true");
    try
    {
        return tokenizer.tokenize();
    }
    catch (Exception e)
    {
        System.err.println(e.getMessage());
        // Fall back to the simpler Penn tokenizer and split its output on whitespace.
        final List<Word> tokens = new ArrayList<Word>();
        for (String token : pennTokenizer.tokenize(string).split("\\s+"))
        {
            tokens.add(new Word(token));
        }
        return tokens;
    }
}
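The option string asks PTBTokenizer to silently drop characters it cannot tokenize (untokenizable=noneDelete) and to apply Penn Treebank escaping (ptb3Escaping=true), which for instance rewrites parentheses as -LRB- and -RRB-. If tokenization throws, the method falls back to the simpler pennTokenizer and splits its whitespace-separated output into Word objects.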
Example 6
/**
 * Parses a sentence and returns a string representation of the parse tree.
 *
 * @param sentence a sentence
 * @return string rendering of the best parse tree, with bracketed
 *         annotations stripped from the node labels
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");
    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    synchronized (parser)
    {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }
    // label tree with character extents
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new
    // MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);
    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
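The commented-out block originally mapped parse-tree nodes back to character offsets in the input text; with that disabled, the method simply returns the best parse as a bracketed string, using the final replaceAll to strip any "[...]" annotations from node labels.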
Example 7
/**
 * Combines the tokens into a <code>Sentence</code>
 *
 * @param tokens the tokens to combine
 * @return <code>Sentence</code> made of the tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
    ArrayList<HasWord> wordList = new ArrayList<HasWord>();
    for (String s : tokens) {
        HasWord w = new Word(s);
        wordList.add(w);
    }
    Sentence sentence = new Sentence();
    sentence.setWords(wordList);
    return sentence;
}
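Each raw token is wrapped in a Word, which implements HasWord, and the resulting list is installed in a Sentence object that downstream parsing code can consume.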
Example 8
public List<String> tokenizeString(String string)
{
    final List<String> tokens = new ArrayList<String>();
    for (Word w : tokenize(string))
    {
        tokens.add(w.word());
    }
    return tokens;
}
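This is a thin wrapper around the tokenize(String) method from Example 5 that keeps only the surface strings; for example, tokenizeString("It's done.") would yield [It, 's, done, .] under Penn Treebank tokenization rules.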
Example 9
@Override
public List<InputWord> tag(final List<InputWord> input) {
    return tagger.tagSentence(input.stream().map(w -> new Word(w.word)).collect(Collectors.toList()))
            .stream()
            .map(w -> new InputWord(w.word(), w.tag(), null))
            .collect(Collectors.toList());
}
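Here the tagger is presumably used through the Stanford MaxentTagger-style tagSentence API: the external InputWord tokens are first converted to Word objects, and each tagged result is mapped back to a new InputWord carrying the predicted part-of-speech tag.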