package csc415_lucene_Part2;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.w3c.dom.Element;

/**
 * k-nearest-neighbour question classifier backed by an in-memory Lucene index.
 *
 * <p>Training questions are read from an ARFF file and indexed (fields
 * {@code question} and {@code class}). Each test question is then classified
 * by issuing a growing phrase query per non-stop-word token, taking a k-NN
 * vote on the {@code class} field of the top hits, and picking the majority.
 *
 * <p>NOTE(review): class name kept lowercase ({@code knnService}) for source
 * compatibility with existing callers, against Java naming convention.
 */
public class knnService {

    /** ARFF section marker; data rows follow the line equal to this value. */
    String startingPoint = "@data";

    /** The "k" in kNN: number of top hits consulted per sub-query. */
    static int k = 1;

    /**
     * Builds the index from the training file, classifies every test
     * question, and prints the overall success rate.
     *
     * @throws ParseException           if a generated Lucene query is malformed
     * @throws CorruptIndexException    if the index is corrupt
     * @throws LockObtainFailedException if the index write lock cannot be obtained
     * @throws IOException              on index I/O failure
     */
    public void run() throws ParseException, CorruptIndexException, LockObtainFailedException, IOException {
        String trainingFile = "questiontrain.arff";
        String testFile = "questiontest2.arff";
        ArrayList<doc> testDocs = new ArrayList<doc>();

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        readTrainingFile(w, trainingFile);
        w.commit();
        // All documents are committed; release the writer (it was never closed before).
        w.close();

        double noOfTestDocs = readTestFile(testFile, testDocs);
        double countSuccess = 0;
        for (int i = 0; i < testDocs.size(); i++) {
            doc current = testDocs.get(i);
            if (current.question != null) {
                current.matchedClass = getClassName(index, analyzer, current.question);
                countSuccess += current.ifMatchedCorrectly();
            }
        }
        // Guard: the original divided unconditionally and printed NaN when the
        // test file was empty or unreadable (readTestFile returns 0 on error).
        if (noOfTestDocs > 0) {
            System.out.println("Success rate:" + countSuccess / noOfTestDocs);
        } else {
            System.out.println("Success rate: n/a (no test documents read)");
        }
    }

    /**
     * Reads the ARFF test file and appends one {@code doc} per data row.
     * Rows are expected in the form {@code 'question',class}; rows before the
     * {@link #startingPoint} marker are skipped.
     *
     * @param fileName path of the ARFF test file
     * @param testDocs output list the parsed documents are appended to
     * @return the number of documents read, or 0 if reading failed
     * @throws IOException declared for interface compatibility; read errors
     *                     are reported on stdout and yield a 0 return instead
     */
    private int readTestFile(String fileName, ArrayList<doc> testDocs) throws IOException {
        BufferedReader br = null;
        try {
            // NOTE(review): platform default charset, as in the original — confirm
            // the ARFF files are in the platform encoding.
            br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
            int i = 0;
            boolean startStoring = false;
            String strLine;
            while ((strLine = br.readLine()) != null) {
                if (startStoring) {
                    StringTokenizer st = new StringTokenizer(strLine, ",");
                    String question = st.nextToken();
                    // Strip the surrounding single quotes and drop '?' (a Lucene
                    // QueryParser metacharacter).
                    question = question.substring(1, question.length() - 1).replace("?", "");
                    String className = st.nextToken().trim();
                    doc temp = new doc();
                    temp.question = question;
                    temp.actualClass = className;
                    testDocs.add(temp);
                    i++;
                }
                if (strLine.equalsIgnoreCase(startingPoint)) {
                    startStoring = true;
                }
            }
            // Fixed label: this method reads TEST documents (message was swapped
            // with readTrainingFile's).
            System.out.println("Completed reading " + i + " test documents");
            return i;
        } catch (Exception e) {
            System.out.println("Error reading in file: " + e.getMessage());
            return 0;
        } finally {
            if (br != null) {
                br.close(); // the reader was never closed before (resource leak)
            }
        }
    }

    /**
     * Classifies a question. For each non-stop-word token the phrase query is
     * extended by one {@code question:"token"} clause, the k-NN winner for the
     * current prefix query is obtained, and the per-prefix winners are tallied;
     * the majority label is returned ("" if no token produced a vote).
     *
     * @param index        the Lucene index holding the training documents
     * @param analyzer     analyzer used to parse the generated queries
     * @param initialQuery the raw question text
     * @return the majority class label, or "" when no vote was cast
     */
    private static String getClassName(Directory index, StandardAnalyzer analyzer, String initialQuery)
            throws IOException, ParseException {
        StringTokenizer st = new StringTokenizer(initialQuery, " ?");
        String queryString = "";
        Map<String, Integer> votes = new HashMap<String, Integer>();
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            if (token != null && !isStopWord(token)) {
                // Trailing space keeps successive clauses from fusing together
                // (the original concatenated them back-to-back).
                queryString += "question: \"" + token + "\" ";
                // Single search per token — the original ran the identical
                // search twice (once only to print the result).
                tally(votes, getClassName2(index, analyzer, queryString));
            }
        }
        return argMax(votes);
    }

    /**
     * Runs {@code queryString} against the index, fetches the top {@link #k}
     * hits, and returns the majority value of their {@code class} field.
     *
     * @return the majority class among the hits, or "" when the search failed
     *         or returned no hits
     */
    private static String getClassName2(Directory index, StandardAnalyzer analyzer, String queryString)
            throws IOException, ParseException {
        Query q = new QueryParser(Version.LUCENE_35, "", analyzer).parse(queryString);
        Map<String, Integer> votes = new HashMap<String, Integer>();
        IndexReader reader = null;
        try {
            reader = IndexReader.open(index);
            IndexSearcher searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(k, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            // Tally while the searcher is still open. The original closed the
            // searcher first, read documents afterwards, and hit an NPE on a
            // null hits array whenever the search threw.
            for (int i = 0; i < hits.length; ++i) {
                Document d = searcher.doc(hits[i].doc);
                tally(votes, d.get("class"));
            }
            searcher.close();
        } catch (IOException e) {
            System.err.println("Sorry. Error encountered while trying to process index:");
            System.err.println(e.getMessage());
            return "";
        } finally {
            if (reader != null) {
                reader.close(); // the reader was never closed before (resource leak)
            }
        }
        return argMax(votes);
    }

    /** Adds one vote for {@code className} in the tally map. */
    private static void tally(Map<String, Integer> votes, String className) {
        Integer val = votes.get(className);
        votes.put(className, val == null ? 1 : val + 1);
    }

    /** Returns the key with the highest vote count, or "" for an empty map. */
    private static String argMax(Map<String, Integer> votes) {
        int best = -1;
        String bestKey = "";
        for (Map.Entry<String, Integer> e : votes.entrySet()) {
            if (e.getValue() > best) {
                best = e.getValue();
                bestKey = e.getKey();
            }
        }
        return bestKey;
    }

    /**
     * Reads the ARFF training file and indexes each data row as a Lucene
     * document with stored, analyzed {@code question} and {@code class} fields.
     * Read errors are reported on stdout and leave the index partially built.
     *
     * @param w        the writer the documents are added to (not committed here)
     * @param fileName path of the ARFF training file
     */
    public void readTrainingFile(IndexWriter w, String fileName) throws IOException {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
            int i = 0;
            boolean startStoring = false;
            String strLine;
            while ((strLine = br.readLine()) != null) {
                if (startStoring) {
                    StringTokenizer st = new StringTokenizer(strLine, ",");
                    String question = st.nextToken();
                    question = question.substring(1, question.length() - 1).replace("?", "");
                    String className = st.nextToken().trim();
                    Document document = new Document();
                    document.add(new Field("question", question, Field.Store.YES, Field.Index.ANALYZED));
                    document.add(new Field("class", className, Field.Store.YES, Field.Index.ANALYZED));
                    w.addDocument(document);
                    i++;
                }
                if (strLine.equalsIgnoreCase(startingPoint)) {
                    startStoring = true;
                }
            }
            // Fixed label: this method reads TRAINING documents (message was
            // swapped with readTestFile's).
            System.out.println("Completed reading " + i + " training documents");
        } catch (Exception e) {
            System.out.println("Error reading in file: " + e.getMessage());
        } finally {
            if (br != null) {
                br.close(); // the reader was never closed before (resource leak)
            }
        }
    }

    /** Stop words skipped when building queries; "" drops empty tokens. */
    private final static String[] ENGLISH_STOP_WORDS = {
        "a","about","above","after","again","against","all","am","an","and","any",
        "are","aren't","as","at","be","because","been","before","being","below",
        "between","both","but","by","can", "cant", "can't","cannot","could","couldn't","did",
        "didn't","do","does","doesn't","doing","don't","down","during","each",
        "few","for","from","further","had","hadn't","has","hasn't","have","haven't",
        "having","he","he'd","he'll","he's","her","here","here's","hers","herself",
        "him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in",
        "into","is","isn't","it","it's","its","itself","let's","me","more","most",
        "mustn't","my","myself","no","nor","not","of","off","on","once","only","or",
        "other","ought","our","ours","ourselves","out","over","own","same","shan't",
        "she","she'd","she'll","she's","should","shouldn't","so","some","such","than",
        "that","that's","the","their","theirs","them","themselves","then","there","there's",
        "these","they","they'd","they'll","they're","they've","this","those","through","to",
        "too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've",
        "were","weren't","what","what's","when","when's","where","where's","which","while",
        "who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd",
        "you'll","you're","you've","your","yours","yourself","yourselves", ""};

    /**
     * Returns true when {@code input} (case-insensitive) is a stop word.
     */
    public static boolean isStopWord(String input) {
        // Lowercase once instead of once per array element.
        String lower = input.toLowerCase();
        for (int i = 0; i < ENGLISH_STOP_WORDS.length; i++) {
            if (lower.equals(ENGLISH_STOP_WORDS[i])) {
                return true;
            }
        }
        return false;
    }
}