fix pre-Discord-launch calculations, implement JMWE CoreNLP extension, hopefully fix repetitive data in multithreading

jenzur 2019-03-15 01:32:06 +01:00
parent 43da2dd5d5
commit 232190d076
3 changed files with 217 additions and 50 deletions

View File: MYSQLDatahandler.java

@ -50,7 +50,7 @@ import java.util.logging.Logger;
* @author install1
*/
public class MYSQLDatahandler {
public static final long EXPIRE_TIME_IN_SECONDS = TimeUnit.SECONDS.convert(6, TimeUnit.MINUTES);
public static final long EXPIRE_TIME_IN_SECONDS1 = TimeUnit.SECONDS.convert(10, TimeUnit.HOURS);
public static MYSQLDatahandler instance = new MYSQLDatahandler();
@ -60,107 +60,119 @@ public class MYSQLDatahandler {
private LinkedHashMap<String, LinkedHashMap<String, Double>> lHMSMX = new LinkedHashMap<>();
private final Stopwatch stopwatch;
private final Stopwatch stopwatch1;
private static String shiftReduceParserPath = "edu/stanford/nlp/models/srparser/englishSR.ser.gz";
private static String sentimentModel = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
private static String lexParserEnglishRNN = "edu/stanford/nlp/models/lexparser/englishRNN.ser.gz";
private static String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
private static String nerModel = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
private static String jmweIndexData = "/home/javatests/lib/mweindex_wordnet3.0_semcor1.6.data"; // ./lib/mweindex_wordnet3.0_semcor1.6.data
private static MaxentTagger tagger;
private static ShiftReduceParser model;
private static String[] options = {"-maxLength", "100"};
private static Properties props = new Properties();
private static Properties propsSentiment = new Properties();
private static Properties propsJMWE = new Properties();
private static GrammaticalStructureFactory gsf;
private static LexicalizedParser lp;
private static TreebankLanguagePack tlp;
private static AbstractSequenceClassifier<CoreLabel> classifier;
private ExecutorService executor;
// set up Stanford CoreNLP pipeline
private static StanfordCoreNLP pipeline;
private static StanfordCoreNLP pipelineSentiment;
private static StanfordCoreNLP pipelineJMWE;
public static AbstractSequenceClassifier<CoreLabel> getClassifier() {
return classifier;
}
public static void setClassifier(AbstractSequenceClassifier<CoreLabel> classifier) {
MYSQLDatahandler.classifier = classifier;
}
public void instantiateExecutor() {
// asyncMode = true: FIFO scheduling, suited to event-style tasks that are submitted but never joined
this.executor = new ForkJoinPool(Runtime.getRuntime().availableProcessors(),
ForkJoinPool.defaultForkJoinWorkerThreadFactory,
null, true);
}
public MYSQLDatahandler() {
this.stopwatch = Stopwatch.createUnstarted();
this.stopwatch1 = Stopwatch.createStarted();
this.stringCache = new MapMaker().concurrencyLevel(2).makeMap();
}
public static void shiftReduceParserInitiate() {
try {
classifier = CRFClassifier.getClassifierNoExceptions(nerModel);
} catch (ClassCastException ex) {
Logger.getLogger(MYSQLDatahandler.class.getName()).log(Level.SEVERE, null, ex);
}
model = ShiftReduceParser.loadModel(shiftReduceParserPath, options);
tagger = new MaxentTagger(taggerPath);
lp = LexicalizedParser.loadModel(lexParserEnglishRNN, options);
tlp = lp.getOp().langpack();
gsf = tlp.grammaticalStructureFactory();
// set up pipeline properties
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,parse");
props.setProperty("parse.model", modelPath);
props.setProperty("parse.model", shiftReduceParserPath);
props.setProperty("parse.maxlen", "100");
props.setProperty("tokenize.maxlen", "100");
props.setProperty("ssplit.maxlen", "100");
props.setProperty("lemma.maxlen", "100");
props.setProperty("parse.binaryTrees", "true");
propsSentiment.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,parse");
propsSentiment.setProperty("parse.model", lexParserEnglishRNN);
propsSentiment.setProperty("ner.model", nerModel);
propsSentiment.setProperty("sentiment.model", sentimentModel);
propsSentiment.setProperty("sentiment.maxlen", "100");
propsSentiment.setProperty("parse.maxlen", "100");
propsSentiment.setProperty("tokenize.maxlen", "100");
propsSentiment.setProperty("ssplit.maxlen", "100");
propsSentiment.setProperty("annotators", "tokenize,ssplit,pos,parse,depparse,sentiment"); //coref too expensive memorywise
propsJMWE.setProperty("customAnnotatorClass.jmwe", "edu.stanford.nlp.pipeline.JMWEAnnotator");
propsJMWE.setProperty("customAnnotatorClass.jmwe.verbose", "false");
propsJMWE.setProperty("customAnnotatorClass.jmwe.underscoreReplacement", "-");
propsJMWE.setProperty("customAnnotatorClass.jmwe.indexData", jmweIndexData);
propsJMWE.setProperty("customAnnotatorClass.jmwe.detector", "Exhaustive");
//"Consecutive", "Exhaustive", "ProperNouns", "Complex" and "CompositeConsecutiveProperNouns"
propsJMWE.setProperty("annotators", "tokenize, ssplit, pos, lemma, jmwe");
// set up pipeline
pipeline = new StanfordCoreNLP(props);
pipelineSentiment = new StanfordCoreNLP(propsSentiment);
pipelineJMWE = new StanfordCoreNLP(propsJMWE);
}
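The jmwe entry in the annotators list above is resolved through the customAnnotatorClass.jmwe properties. A minimal smoke-test sketch, not part of the commit, assuming the index file exists at the configured path and using the accessor defined just below:
Annotation doc = new Annotation("He gave up the fight");
MYSQLDatahandler.getPipelineJMWE().annotate(doc);
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
for (IMWE<IToken> token : sentence.get(JMWEAnnotation.class)) {
System.out.println(token.getForm()); // detected multi-word expressions, e.g. "give_up"
}
}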
public static StanfordCoreNLP getPipelineJMWE() {
return pipelineJMWE;
}
public static GrammaticalStructureFactory getGsf() {
return gsf;
}
public static StanfordCoreNLP getPipeline() {
return pipeline;
}
public static StanfordCoreNLP getPipelineSentiment() {
return pipelineSentiment;
}
public static MaxentTagger getTagger() {
return tagger;
}
public static ShiftReduceParser getModel() {
return model;
}
private Map<Integer, String> getCache() throws SQLException, IOException, CustomError {
return DataMapper.getAllStrings();
}
public int getlHMSMXSize() {
return lHMSMX.size();
}
public int getstringCacheSize() {
return stringCache.size();
}
public void initiateMYSQL() throws SQLException, IOException {
try {
DataMapper.createTables();
@ -171,7 +183,7 @@ public class MYSQLDatahandler {
.getName()).log(Level.SEVERE, null, ex);
}
}
public synchronized void checkIfUpdateMatrixes() {
refreshMatrixFromDB = false;
int counter = 0;
@ -208,9 +220,8 @@ public class MYSQLDatahandler {
selectUpdate = secondaryIterator;
secondaryIterator++;
}
ConcurrentMap<Integer, String> strIndexNavigator = new MapMaker().concurrencyLevel(2).makeMap();
String get = stringCachelocal.getOrDefault(selectUpdate, null);
if (get == null) {
get = stringCachelocal.get(new Random().nextInt(stringCachelocal.size() - 1)); // fallback to a random cached string; assumes at least two entries, since nextInt(0) would throw
}
@ -220,7 +231,7 @@ public class MYSQLDatahandler {
strIndexNavigator.values().forEach((str) -> {
stringCachelocal.values().stream().filter((str1) -> (!str.equals(str1))).forEachOrdered((str1) -> {
boolean present = false;
LinkedHashMap<String, Double> orDefault = lHMSMX.getOrDefault(str, null);
if (orDefault != null) {
Double orDefault1 = orDefault.getOrDefault(str1, null);
if (orDefault1 != null) {
@ -229,7 +240,7 @@ public class MYSQLDatahandler {
}
}
if (!present) {
orDefault = lHMSMX.getOrDefault(str1, null);
if (orDefault != null) {
Double orDefault1 = orDefault.getOrDefault(str, null);
if (orDefault1 != null) {
@ -242,12 +253,12 @@ public class MYSQLDatahandler {
SimilarityMatrix SMX = new SimilarityMatrix(str, str1);
Callable<SimilarityMatrix> worker = new SentimentAnalyzerTest(str, str1, SMX);
futures.put(futures.size() + 1, executor.submit(worker));
LinkedHashMap<String, Double> orDefault1 = lHMSMX.getOrDefault(str, null);
if (orDefault1 == null) {
orDefault1 = new LinkedHashMap<String, Double>();
}
orDefault1.put(str1, 0.0);
lHMSMX.put(str, orDefault1);
}
});
});
@ -262,12 +273,11 @@ public class MYSQLDatahandler {
Logger.getLogger(MYSQLDatahandler.class.getName()).log(Level.SEVERE, null, ex);
}
LinkedHashMap<String, Double> getFuture = lHMSMX.getOrDefault(SMX.getPrimaryString(), null);
if (getFuture != null) {
getFuture.put(SMX.getSecondaryString(), SMX.getDistance());
lHMSMX.put(SMX.getPrimaryString(), getFuture);
matrixUpdateList.put(matrixUpdateList.size() + 1, SMX);
}
}
try {
if (!matrixUpdateList.isEmpty()) {
@ -280,7 +290,7 @@ public class MYSQLDatahandler {
}
}
}
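Each new string pair above becomes one SentimentAnalyzerTest callable submitted to the executor field. A minimal sketch of the same submit-and-collect pattern in isolation, with hypothetical input strings and the constructors shown in this diff:
SimilarityMatrix SMX = new SimilarityMatrix("how are you", "how do you do");
Future<SimilarityMatrix> future = executor.submit(new SentimentAnalyzerTest("how are you", "how do you do", SMX));
SimilarityMatrix scored = future.get(); // blocks until the worker finishes; throws InterruptedException/ExecutionException on failure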
public synchronized void checkIfUpdateStrings() throws CustomError {
if (stopwatch.elapsed(TimeUnit.SECONDS) >= EXPIRE_TIME_IN_SECONDS || !stopwatch.isRunning()) {
ConcurrentMap<Integer, String> str = MessageResponseHandler.getStr();
@ -307,7 +317,7 @@ public class MYSQLDatahandler {
}
}
}
public synchronized String getResponseMsg(String str) throws CustomError {
str = str.trim();
if (str.startsWith("<@")) {
@ -380,7 +390,7 @@ public class MYSQLDatahandler {
+ "\nScore: " + SMXreturn.getDistance());
return SMXreturn.getSecondaryString();
}
public String mostSimilar(String toBeCompared, ConcurrentMap<Integer, String> concurrentStrings) {
int minDistance = 8;
String similar = "";
@ -403,7 +413,7 @@ public class MYSQLDatahandler {
}
return similar;
}
public static ConcurrentMap<Integer, String> cutContent(ConcurrentMap<Integer, String> str) {
ConcurrentMap<Integer, String> returnlist = new MapMaker().concurrencyLevel(2).makeMap();
for (String str1 : str.values()) {
@ -415,7 +425,7 @@ public class MYSQLDatahandler {
}
return returnlist;
}
public static ConcurrentMap<Integer, String> filterContent(ConcurrentMap<Integer, String> str) {
ConcurrentMap<Integer, String> strlistreturn = new MapMaker().concurrencyLevel(2).makeMap();
for (String str1 : str.values()) {
@ -523,7 +533,7 @@ public class MYSQLDatahandler {
}
return strlistreturn;
}
private ConcurrentMap<Integer, String> removeSlacks(ConcurrentMap<Integer, String> str) {
ShiftReduceParser model = getModel();
MaxentTagger tagger = getTagger();
@ -590,7 +600,7 @@ public class MYSQLDatahandler {
}
return strreturn;
}
private ConcurrentMap<Integer, String> verifyCalculationFitness(ConcurrentMap<Integer, String> strmap) {
ConcurrentMap<Integer, String> returnmap = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> allStrings = stringCache;

View File: SentimentAnalyzerTest.java

@ -4,11 +4,15 @@ import FunctionLayer.LevenshteinDistance;
import FunctionLayer.MYSQLDatahandler;
import FunctionLayer.SimilarityMatrix;
import com.google.common.collect.MapMaker;
import edu.mit.jmwe.data.IMWE;
import edu.mit.jmwe.data.IMWEDesc;
import edu.mit.jmwe.data.IToken;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.JMWEAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
@ -48,14 +52,16 @@ import org.ejml.simple.SimpleMatrix;
*/
public class SentimentAnalyzerTest implements Callable<SimilarityMatrix> {
private String str;
private String str1;
private SimilarityMatrix smxParam;
private ShiftReduceParser model;
private MaxentTagger tagger;
private GrammaticalStructureFactory gsf;
private StanfordCoreNLP pipeline;
private StanfordCoreNLP pipelineSentiment;
private StanfordCoreNLP pipelineJMWE;
private AbstractSequenceClassifier classifier;
public SentimentAnalyzerTest(String str, String str1, SimilarityMatrix smxParam) {
@ -66,6 +72,7 @@ public class SentimentAnalyzerTest implements Callable<SimilarityMatrix> {
tagger = MYSQLDatahandler.getTagger();
pipeline = MYSQLDatahandler.getPipeline();
pipelineSentiment = MYSQLDatahandler.getPipelineSentiment();
pipelineJMWE = MYSQLDatahandler.getPipelineJMWE();
gsf = MYSQLDatahandler.getGsf();
classifier = MYSQLDatahandler.getClassifier();
}
@ -231,6 +238,7 @@ public class SentimentAnalyzerTest implements Callable<SimilarityMatrix> {
Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
String partText = sentence.toString();
SimpleMatrix predictions = RNNCoreAnnotations.getPredictions(tree);
if (partText.length() > longest1) {
mainSentiment1 = sentiment;
longest1 = partText.length();
@ -258,6 +266,155 @@ public class SentimentAnalyzerTest implements Callable<SimilarityMatrix> {
score -= (deffLongest - deffshorter) * 50;
}
}
Annotation jmweStrAnnotation = new Annotation(str);
pipelineJMWE.annotate(jmweStrAnnotation);
List<CoreMap> sentences = jmweStrAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
int tokensCounter1 = 0;
int tokensCounter2 = 0;
int anotatorcounter1 = 0;
int anotatorcounter2 = 0;
int inflectedCounterPositive1 = 0;
int inflectedCounterPositive2 = 0;
int inflectedCounterNegative = 0;
int MarkedContinuousCounter1 = 0;
int MarkedContinuousCounter2 = 0;
int UnmarkedPatternCounter = 0;
ConcurrentMap<Integer, String> ITokenMapTag1 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> ITokenMapTag2 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenStems1 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenStems2 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenForm1 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenForm2 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenGetEntry1 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenGetEntry2 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenGetiPart1 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenGetiPart2 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenEntryPOS1 = new MapMaker().concurrencyLevel(2).makeMap();
ConcurrentMap<Integer, String> strTokenEntryPOS2 = new MapMaker().concurrencyLevel(2).makeMap();
for (CoreMap sentence : sentences) {
for (IMWE<IToken> token : sentence.get(JMWEAnnotation.class)) {
if (token.isInflected()) {
inflectedCounterPositive1++;
} else {
inflectedCounterNegative++;
}
strTokenForm1.put(strTokenForm1.size() + 1, token.getForm());
strTokenGetEntry1.put(strTokenGetEntry1.size() + 1, token.getEntry().toString().substring(token.getEntry().toString().length() - 1));
Collection<IMWEDesc.IPart> values = token.getPartMap().values();
IMWEDesc entry = token.getEntry();
MarkedContinuousCounter1 += entry.getMarkedContinuous();
UnmarkedPatternCounter += entry.getUnmarkedPattern();
for (IMWEDesc.IPart iPart : values) {
strTokenGetiPart1.put(strTokenGetiPart1.size() + 1, iPart.getForm());
}
for (String strPostPrefix : entry.getPOS().getPrefixes()) {
strTokenEntryPOS1.put(strTokenEntryPOS1.size() + 1, strPostPrefix);
}
for (IToken tokens : token.getTokens()) {
ITokenMapTag1.put(ITokenMapTag1.size() + 1, tokens.getTag());
for (String strtoken : tokens.getStems()) {
strTokenStems1.put(strTokenStems1.size() + 1, strtoken);
}
}
tokensCounter1++;
}
anotatorcounter1++;
}
jmweStrAnnotation = new Annotation(str1);
pipelineJMWE.annotate(jmweStrAnnotation);
sentences = jmweStrAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
for (IMWE<IToken> token : sentence.get(JMWEAnnotation.class)) {
if (token.isInflected()) {
inflectedCounterPositive2++;
} else {
inflectedCounterNegative++; // count uninflected MWEs from str1 the same way as from str
}
strTokenForm2.put(strTokenForm2.size() + 1, token.getForm());
strTokenGetEntry2.put(strTokenGetEntry2.size() + 1, token.getEntry().toString().substring(token.getEntry().toString().length() - 1));
Collection<IMWEDesc.IPart> values = token.getPartMap().values();
IMWEDesc entry = token.getEntry();
MarkedContinuousCounter2 += entry.getMarkedContinuous();
UnmarkedPatternCounter += entry.getUnmarkedPattern();
for (IMWEDesc.IPart iPart : values) {
strTokenGetiPart2.put(strTokenGetiPart2.size() + 1, iPart.getForm());
}
for (String strPostPrefix : entry.getPOS().getPrefixes()) {
strTokenEntryPOS2.put(strTokenEntryPOS2.size() + 1, strPostPrefix);
}
for (IToken tokens : token.getTokens()) {
ITokenMapTag2.put(ITokenMapTag2.size() + 1, tokens.getTag());
for (String strtoken : tokens.getStems()) {
strTokenStems2.put(strTokenStems2.size() + 1, strtoken);
}
}
tokensCounter2++;
}
anotatorcounter2++;
}
for (String strTokenPos1 : strTokenEntryPOS1.values()) {
for (String strTokenPos2 : strTokenEntryPOS2.values()) {
if (strTokenPos1.equals(strTokenPos2)) {
score += 500;
}
}
}
score += UnmarkedPatternCounter * 1600;
if (MarkedContinuousCounter1 > 0 && MarkedContinuousCounter2 > 0) {
score += MarkedContinuousCounter1 > MarkedContinuousCounter2 ? (MarkedContinuousCounter1 - MarkedContinuousCounter2) * 500
: (MarkedContinuousCounter2 - MarkedContinuousCounter1) * 500;
}
for (String strTokeniPart1 : strTokenGetiPart1.values()) {
for (String strTokeniPart2 : strTokenGetiPart2.values()) {
if (strTokeniPart1.equals(strTokeniPart2)) {
score += 400;
}
}
}
for (String strTokenEntry1 : strTokenGetEntry1.values()) {
for (String strTokenEntry2 : strTokenGetEntry2.values()) {
if (strTokenEntry1.equals(strTokenEntry2)) {
score += 2500;
}
}
}
for (String strmapTag : ITokenMapTag1.values()) {
for (String strmapTag1 : ITokenMapTag2.values()) {
if (strmapTag.equals(strmapTag1)) {
score += 1450;
}
}
}
for (String strTokenForm1itr1 : strTokenForm1.values()) {
for (String strTokenForm1itr2 : strTokenForm2.values()) {
if (strTokenForm1itr1.equals(strTokenForm1itr2)) {
score += 2600;
} else if (strTokenForm1itr1.contains(strTokenForm1itr2)) {
score += 500;
}
}
}
for (String strTokenStem : strTokenStems1.values()) {
for (String strTokenStem1 : strTokenStems2.values()) {
if (strTokenStem.equals(strTokenStem1)) {
score += 1500;
}
}
}
if (inflectedCounterPositive1 + inflectedCounterPositive2 > inflectedCounterNegative && inflectedCounterNegative > 0) {
score += (inflectedCounterPositive1 - inflectedCounterNegative) * 650;
}
if (inflectedCounterPositive1 > 0 && inflectedCounterPositive2 > 0) {
score += ((inflectedCounterPositive1 + inflectedCounterPositive2) - inflectedCounterNegative) * 550;
}
if (anotatorcounter1 > 1 && anotatorcounter2 > 1) {
score += (anotatorcounter1 + anotatorcounter2) * 400;
}
if (tokensCounter1 > 0 && tokensCounter2 > 0) {
score += (tokensCounter1 + tokensCounter2) * 400;
} else {
score -= tokensCounter1 >= tokensCounter2 ? (tokensCounter1 - tokensCounter2) * 500 : (tokensCounter2 - tokensCounter1) * 500;
}
LevenshteinDistance leven = new LevenshteinDistance(str, str1);
int SentenceScoreDiff = leven.computeLevenshteinDistance();
SentenceScoreDiff *= 15;
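The Levenshtein term scales linearly with raw edit distance, so heavier rewrites between the two strings are penalized proportionally. A minimal sketch of that term in isolation, reusing the two-argument constructor above:
LevenshteinDistance leven = new LevenshteinDistance("kitten", "sitting");
int penalty = leven.computeLevenshteinDistance() * 15; // 3 edits -> 45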

View File: DiscordHandler.java

@ -46,7 +46,7 @@ public class DiscordHandler {
MYSQLDatahandler.shiftReduceParserInitiate();
MYSQLDatahandler.instance.instantiateExecutor();
if (MYSQLDatahandler.instance.getstringCacheSize() != 0) {
// preload matrix entries while 2*m*m < n*n - n (e.g. n = 100 cached strings: 9900, so the loop runs while m <= 70)
while (MYSQLDatahandler.instance.getlHMSMXSize() * MYSQLDatahandler.instance.getlHMSMXSize() * 2
< (MYSQLDatahandler.instance.getstringCacheSize()
* MYSQLDatahandler.instance.getstringCacheSize())
- MYSQLDatahandler.instance.getstringCacheSize()) {