109 lines
3.7 KiB
Java
109 lines
3.7 KiB
Java
|
package FunctionLayer;
|
||
|
|
||
|
import java.util.Collections;
|
||
|
import java.util.List;
|
||
|
import java.util.Properties;
|
||
|
import java.util.Set;
|
||
|
import java.util.Arrays;
|
||
|
|
||
|
import edu.stanford.nlp.ling.CoreAnnotation;
|
||
|
import edu.stanford.nlp.pipeline.Annotator;
|
||
|
import org.apache.lucene.analysis.CharArraySet;
|
||
|
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||
|
|
||
|
import edu.stanford.nlp.ling.*;
|
||
|
import edu.stanford.nlp.pipeline.Annotation;
|
||
|
import edu.stanford.nlp.util.Pair;
|
||
|
import edu.stanford.nlp.util.ArraySet;
|
||
|
|
||
|
/**
|
||
|
* CoreNlp Annotator that checks if in coming token is a stopword
|
||
|
*
|
||
|
* @author John Conwell
|
||
|
* @author Paul Landes
|
||
|
*/
|
||
|
public class StopwordAnnotator implements Annotator, CoreAnnotation<Pair<Boolean, Boolean>> {
|
||
|
|
||
|
/**
|
||
|
* stopword annotator class name used in annotators property
|
||
|
*/
|
||
|
public static final String ANNOTATOR_CLASS = "stopword";
|
||
|
|
||
|
/**
|
||
|
* Property key to specify the comma delimited list of custom stopwords
|
||
|
*/
|
||
|
public static final String STOPWORDS_LIST = "stopword-list";
|
||
|
|
||
|
/**
|
||
|
* Property key to specify if stopword list is case insensitive
|
||
|
*/
|
||
|
public static final String IGNORE_STOPWORD_CASE = "ignore-stopword-case";
|
||
|
|
||
|
private static Class<? extends Pair> boolPair = Pair.makePair(true, true).getClass();
|
||
|
|
||
|
private Properties props;
|
||
|
private CharArraySet stopwords;
|
||
|
|
||
|
public StopwordAnnotator() {
|
||
|
this(new Properties());
|
||
|
}
|
||
|
|
||
|
public StopwordAnnotator(String notUsed, Properties props) {
|
||
|
this(props);
|
||
|
}
|
||
|
|
||
|
public StopwordAnnotator(Properties props) {
|
||
|
this.props = props;
|
||
|
if (this.props.containsKey(STOPWORDS_LIST)) {
|
||
|
String stopwordList = props.getProperty(STOPWORDS_LIST);
|
||
|
boolean ignoreCase = Boolean.parseBoolean(props.getProperty(IGNORE_STOPWORD_CASE, "false"));
|
||
|
this.stopwords = getStopWordList(stopwordList, ignoreCase);
|
||
|
} else {
|
||
|
this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public void annotate(Annotation annotation) {
|
||
|
if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
|
||
|
List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
|
||
|
for (CoreLabel token : tokens) {
|
||
|
boolean isWordStopword = stopwords.contains(token.word().toLowerCase());
|
||
|
boolean isLemmaStopword = stopwords.contains(token.lemma().toLowerCase());
|
||
|
Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword);
|
||
|
token.set(StopwordAnnotator.class, pair);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
|
||
|
return Collections.singleton(StopwordAnnotator.class);
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public Set<Class<? extends CoreAnnotation>> requires() {
|
||
|
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
|
||
|
CoreAnnotations.TextAnnotation.class,
|
||
|
CoreAnnotations.TokensAnnotation.class,
|
||
|
CoreAnnotations.LemmaAnnotation.class,
|
||
|
CoreAnnotations.PartOfSpeechAnnotation.class
|
||
|
)));
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
@SuppressWarnings("unchecked")
|
||
|
public Class<Pair<Boolean, Boolean>> getType() {
|
||
|
return (Class<Pair<Boolean, Boolean>>) boolPair;
|
||
|
}
|
||
|
|
||
|
public static CharArraySet getStopWordList(String stopwordList, boolean ignoreCase) {
|
||
|
String[] terms = stopwordList.split(",");
|
||
|
CharArraySet stopwordSet = new CharArraySet(terms.length, ignoreCase);
|
||
|
for (String term : terms) {
|
||
|
stopwordSet.add(term);
|
||
|
}
|
||
|
return CharArraySet.unmodifiableSet(stopwordSet);
|
||
|
}
|
||
|
}
|