Solutions des exercices du TP 2
3.1.1.
/**
 * Builds an on-disk Lucene index for the documents named in a list file.
 *
 * <p>Steps: load the document ids from {@code docPath + "/" + inputListFile},
 * open an index writer on {@code indexPath}, index every listed document,
 * optimize the index, and close the writer.
 *
 * @param docPath       directory holding the list file and the per-document files
 * @param inputListFile name of the list file, relative to {@code docPath}
 * @param indexPath     directory in which the index is written
 * @throws Throwable    any failure raised while reading the inputs or writing the index
 */
public SimpleLuceneIndex(String docPath, String inputListFile, String indexPath) throws Throwable {
    // Reset the member; setupDocIds() assigns a fresh list right after.
    docIdList = null;

    // Fill the list of ids of the documents to index.
    setupDocIds(docPath + "/" + inputListFile);

    // Object giving read/write access to the index on disk.
    FSDirectory indexDir = FSDirectory.open(new File(indexPath));

    // Object that performs the actual index writing (third argument is the "create" flag).
    indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(), true);
    try {
        // Main work: read and index every listed document.
        gatherAndIndexDocs(docPath);

        // Optimize the index so that later searches are more efficient.
        indexWriter.optimize();
    } finally {
        // Finalize the index and write it to disk. Done in a finally block so the
        // writer is also released when indexing or optimizing fails part-way.
        indexWriter.close();
    }
}
3.1.2.
/**
 * Reads a whole text file and returns its content as a single string.
 *
 * <p>Lines are joined with the platform line separator
 * ({@code System.getProperty("line.separator")}); no separator is added before
 * the first line or after the last one. A file with no lines yields {@code ""}.
 *
 * <p>NOTE(review): {@code Scanner(InputStream)} decodes bytes with the platform
 * default charset; confirm that this matches the encoding of the input files.
 *
 * @param fullPathInputTextFile full path of the file to read
 * @return the file content, lines separated by the platform line separator
 * @throws Throwable if the file cannot be opened
 */
private String readTextFile(String fullPathInputTextFile) throws Throwable {
    Scanner scanner = new Scanner(new FileInputStream(fullPathInputTextFile));
    try {
        StringBuilder text = new StringBuilder();
        String lineSeparator = System.getProperty("line.separator");

        // The first line goes in without a leading separator ...
        if (scanner.hasNextLine()) {
            text.append(scanner.nextLine());
        }
        // ... every following line is preceded by one.
        while (scanner.hasNextLine()) {
            text.append(lineSeparator).append(scanner.nextLine());
        }
        return text.toString();
    } finally {
        // Closing the scanner also closes the underlying file stream,
        // even when reading fails.
        scanner.close();
    }
}
3.1.3.
/**
 * Fills {@code docIdList} with the document ids found in the list file.
 *
 * <p>Each line is split on runs of whitespace and the second token
 * (index 1) is taken as the document id. Lines that yield fewer than two
 * tokens are skipped. Progress is reported on standard output.
 *
 * <p>NOTE(review): a line starting with whitespace produces an empty first
 * token, so its first visible word ends up at index 1; confirm that the list
 * file never has leading whitespace.
 *
 * @param fullPathInputListFile full path of the list file
 * @throws Throwable if the list file cannot be opened
 */
private void setupDocIds(String fullPathInputListFile) throws Throwable {
    docIdList = new ArrayList<String>();
    System.out.println("setupListOfDocIds(): Attempting to open " + fullPathInputListFile);
    Scanner scanner = new Scanner(new FileInputStream(fullPathInputListFile));
    try {
        while (scanner.hasNextLine()) {
            String[] lineToken = scanner.nextLine().split("\\s+");
            if (lineToken.length > 1) {
                docIdList.add(lineToken[1]);
                System.out.println("setupListOfDocIds(): Adding id " + lineToken[1]);
            }
        }
    } finally {
        // Release the file handle even when reading fails.
        scanner.close();
    }
}
3.1.4.
/**
 * Adds one Lucene document to the index for every id in {@code docIdList}.
 *
 * <p>For each id, five companion files are read from {@code path}
 * ({@code download_}, {@code url_}, {@code title_}, {@code outlinks_} and
 * {@code rankscore_}, each followed by the id and {@code .txt}) and turned
 * into fields:
 * <ul>
 *   <li>{@code content}: analyzed, with term vectors, not stored;</li>
 *   <li>{@code url}, {@code id}, {@code title}, {@code outlinks}: stored, not indexed;</li>
 *   <li>{@code rankscore}: stored and indexed without analysis.</li>
 * </ul>
 *
 * @param path directory containing the per-document files
 * @throws Throwable if a file cannot be read or the document cannot be added
 */
private void gatherAndIndexDocs(String path) throws Throwable {
    final String prefix = path + "/";
    final String suffix = ".txt";

    for (String id : docIdList) {
        String contentFile = prefix + "download_" + id + suffix;
        System.out.println("gatherAndIndexDocs(): Processing doc " + contentFile);

        // Read all companion files first, in the same order as before.
        String content = readTextFile(contentFile);
        String url = readTextFile(prefix + "url_" + id + suffix);
        String title = readTextFile(prefix + "title_" + id + suffix);
        String outlinks = readTextFile(prefix + "outlinks_" + id + suffix);
        String score = readTextFile(prefix + "rankscore_" + id + suffix);

        Document entry = new Document();

        // Searchable body text: analyzed with term vectors, but not stored.
        Field contentField = new Field("content", content,
                Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        entry.add(contentField);

        // Display-only metadata: stored, never indexed.
        Field urlField = new Field("url", url, Field.Store.YES, Field.Index.NO);
        entry.add(urlField);
        Field idField = new Field("id", id, Field.Store.YES, Field.Index.NO);
        entry.add(idField);
        Field titleField = new Field("title", title, Field.Store.YES, Field.Index.NO);
        entry.add(titleField);
        Field outlinksField = new Field("outlinks", outlinks, Field.Store.YES, Field.Index.NO);
        entry.add(outlinksField);

        // Rank score: stored and indexed as a single, unanalyzed term.
        Field scoreField = new Field("rankscore", score,
                Field.Store.YES, Field.Index.NOT_ANALYZED);
        entry.add(scoreField);

        indexWriter.addDocument(entry);
    }
}
3.2.
/**
 * Runs one query against an on-disk Lucene index and prints the best hits.
 *
 * <p>The query string is parsed against the field named by
 * {@code searchFieldName} using a {@code StandardAnalyzer}. The number of
 * hits is printed first; then, for each hit, one line is printed with the
 * 1-based rank, the stored value of {@code docUrlFieldName}, the Lucene
 * score and the stored value of {@code docTitleFieldName}.
 *
 * @param indexPath     directory containing the index
 * @param queryString   query in Lucene query-parser syntax
 * @param maxHitsString maximum number of hits to collect, as a decimal integer
 * @throws Throwable    if {@code maxHitsString} is not an integer, the index cannot
 *                      be opened, the query cannot be parsed, or the search fails
 */
public SimpleLuceneSearcher(String indexPath, String queryString,
        String maxHitsString) throws Throwable {
    // parseInt yields the primitive directly (no Integer boxing/unboxing round trip).
    int maxHits = Integer.parseInt(maxHitsString);

    IndexSearcher indexSearcher = new IndexSearcher(FSDirectory.open(new File(indexPath)));
    try {
        TopScoreDocCollector collector = TopScoreDocCollector.create(maxHits, true);
        Query luceneQuery =
                new QueryParser(searchFieldName, new StandardAnalyzer()).parse(queryString);
        indexSearcher.search(luceneQuery, collector);

        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        System.out.println("Found " + hits.length + " hits.");

        for (int kHit = 0; kHit < hits.length; ++kHit) {
            int luceneDocId = hits[kHit].doc;
            String luceneScore = String.valueOf(hits[kHit].score);
            // Fetch the stored fields of the hit to build the display line.
            Document luceneDoc = indexSearcher.doc(luceneDocId);
            String docTitle = luceneDoc.get(docTitleFieldName);
            String displayEntry = String.valueOf(kHit + 1) + ". "
                    + luceneDoc.get(docUrlFieldName) + " "
                    + luceneScore + " " + docTitle;
            System.out.println(displayEntry);
        }
    } finally {
        // Release the searcher (and the index reader it opened), including
        // when query parsing or searching fails.
        indexSearcher.close();
    }
}