/*
* Copyright (C) 2009 by TunedIT
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package rsctc2010;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipInputStream;
import org.debellor.base.evaluator.score.Score;
import org.debellor.core.data.SymbolicFeature;
import org.debellor.core.exception.data.DataException;
import org.debellor.core.util.Permute;
import org.tunedit.core.EvaluationProcedure;
import org.tunedit.core.ResourceLoader;
import org.tunedit.core.ResourceName;
import org.tunedit.core.exception.AlgorithmErrorException;
import org.tunedit.core.exception.EvaluationSetupException;
import org.tunedit.core.exception.TunedTesterException;
/**
 * Evaluation procedure that takes two ZIP files whose entries contain decisions,
 * one decision per line. One of the ZIP files contains the predicted decisions and
 * the other one contains the ground-truth decisions.
 * Iterates through the entries of the ground-truth ZIP, computes the balanced accuracy
 * of the predictions for each entry and returns the mean over all entries.
 * Every entry in the ground-truth ZIP must be matched by an entry with the same name
 * in the predictions ZIP.
 * We assume that the files are small, so all their contents can be loaded into memory.
 *
 * @author Marcin Wojnarski
 *
 */
public class EvalDecisions extends EvaluationProcedure {

    /** Seed of the random number generator that selects which samples are evaluated.
     * A fresh generator is created from this seed on every call of
     * {@link #run(ResourceName, ResourceName, ResourceLoader)}, so the selection is the same
     * in every run of the procedure, if only the dataset with target decisions is the same,
     * no matter how many times this instance has been used before. */
    private static final long MASK_SEED = 961;

    /**
     * Loads both ZIP files into memory and scores the predictions against the targets.
     * Only a part of the samples of each entry is scored: one part when the name of the
     * targets resource ends with "final.zip", the remaining samples otherwise.
     *
     * @param predictionsName - resource name of the file with predictions supplied by an algorithm
     * @param targetsName - resource name of the file with true decisions
     * @param loader - used to open both resources
     * @return one-element array holding the mean per-entry score
     * @throws AlgorithmErrorException if the predictions resource is not a file, is not a correct
     *         ZIP file, lacks an entry present in the targets, or an entry has too few lines
     * @throws EvaluationSetupException if the targets resource is not a file, is not a correct
     *         ZIP file, or contains no entries
     * @throws TunedTesterException on I/O errors or when a pair of decisions cannot be scored
     *
     * @see org.tunedit.core.EvaluationProcedure#run(org.tunedit.core.ResourceName, org.tunedit.core.ResourceName, org.tunedit.core.ResourceLoader)
     */
    public Double[] run(ResourceName predictionsName, ResourceName targetsName, ResourceLoader loader)
    throws TunedTesterException, EvaluationSetupException, AlgorithmErrorException
    {
        if(!predictionsName.isFile())
            throw new AlgorithmErrorException("Incorrect resource name: " + predictionsName + ". Expected file resource");
        if(!targetsName.isFile())
            throw new EvaluationSetupException("Incorrect resource name: " + targetsName + ". Expected file resource");

        // Declared outside the try block so that the finally clause can release whatever
        // has been opened, also when opening the second stream or the evaluation itself fails.
        ZipInputStream targetsZip = null;
        ZipInputStream predictionsZip = null;
        try {
            targetsZip = new ZipInputStream(loader.open(targetsName));
            predictionsZip = new ZipInputStream(loader.open(predictionsName));

            Map<String, ArrayList<String>> targets, predictions;
            try {
                targets = loadContents(targetsZip);
            }
            catch(ZipException e) { throw new EvaluationSetupException("File with target decisions is not a correct ZIP file"); }

            // An archive without entries would make the mean below 0/0 = NaN. ZipInputStream
            // reports no entries (rather than failing) for some inputs that are not ZIP files
            // at all, so this is reported as a problem with the evaluation setup.
            if(targets.isEmpty())
                throw new EvaluationSetupException(
                        "File with target decisions contains no entries: it is empty or not a correct ZIP file");

            try {
                predictions = loadContents(predictionsZip);
            }
            catch(ZipException e) { throw new AlgorithmErrorException("File with predicted decisions is not a correct ZIP file"); }

            boolean isFinal = targetsName.toString().endsWith("final.zip");
            double res = compare(targets, predictions, isFinal, new Random(MASK_SEED));
            return new Double[] { res };
        }
        catch(IOException e) {
            throw new TunedTesterException(e);
        }
        finally {
            closeQuietly(predictionsZip);
            closeQuietly(targetsZip);
        }
    }

    /**
     * Scores every target file against the prediction file of the same name and
     * returns the mean of the per-file scores.
     * Files are visited in alphabetical order of their names, which keeps the sequence of
     * numbers drawn from <code>random</code> - and so the selected samples - reproducible.
     * Lines of a prediction file beyond the length of the target file are ignored.
     *
     * @param targets - lines of every file in the targets ZIP, keyed by entry name; must not be empty
     * @param predictions - lines of every file in the predictions ZIP, keyed by entry name
     * @param isFinal - selects which part of the samples is scored, see {@link #getMask}
     * @param random - source of randomness for the sample selection
     * @throws AlgorithmErrorException if a prediction file is missing or has too few lines
     * @throws TunedTesterException if a pair of decisions cannot be added to the score
     */
    private double compare(
            Map<String, ArrayList<String>> targets,
            Map<String, ArrayList<String>> predictions,
            boolean isFinal,
            Random random) throws AlgorithmErrorException, TunedTesterException
    {
        double sumResult = 0.0;

        for(String file : new TreeSet<String>(targets.keySet())) {      // TreeSet sorts keys alphabetically
            ArrayList<String> t = targets.get(file);
            ArrayList<String> p = predictions.get(file);

            if(p == null) throw new AlgorithmErrorException(
                    "ZIP file with predicted decisions doesn't contain a file: " + file);
            if(p.size() < t.size()) throw new AlgorithmErrorException(
                    "File " + file + " in the ZIP with predicted decisions contains too few lines: "
                    + p.size() + " instead of " + t.size());

            Score score = new BalancedAccuracy();
            boolean[] mask = getMask(t.size(), isFinal, random);

            try {
                for(int i = 0; i < t.size(); i++)
                    if(mask[i])
                        score.add(new SymbolicFeature(t.get(i)), new SymbolicFeature(p.get(i)));
            }
            catch (DataException e) {
                // Carrying on would report a score computed from only a part of the samples,
                // so the evaluation is aborted instead.
                throw new TunedTesterException(e);
            }

            sumResult += score.result();
        }

        return sumResult / targets.size();
    }

    /**
     * Builds the selection mask for a file with <code>len</code> samples and prints
     * the selected sample numbers to standard output.
     * The indices returned by <code>Permute.indices(len, len / 2, random)</code> are the samples
     * used in final testing (isFinal = true); all the other samples are used in preliminary
     * testing (isFinal = false).
     * NOTE(review): presumably Permute.indices picks len/2 distinct indices out of [0, len) -
     * confirm against the Debellor sources.
     *
     * @param len - number of samples in the file
     * @param isFinal - true to select the final-testing samples, false to select the rest
     * @param random - source of randomness for drawing the indices
     * @return array of length <code>len</code> with true at the positions to be evaluated
     */
    private boolean[] getMask(int len, boolean isFinal, Random random)
    {
        // Indices of objects that will be used in final testing (isFinal = true)
        // The rest will be used in preliminary testing
        int[] indices = Permute.indices(len, len / 2, random);

        // Start from the opposite value everywhere, then flip the drawn positions:
        // the drawn positions end up equal to isFinal, all the others to !isFinal.
        boolean[] mask = new boolean[len];
        Arrays.fill(mask, !isFinal);
        for(int ind : indices)
            mask[ind] = isFinal;

        System.out.print("Will evaluate on samples number:");
        for(int i = 0; i < len; i++)
            if(mask[i]) System.out.print(" " + i);
        System.out.println();

        return mask;
    }

    /**
     * Reads all entries of a ZIP stream into memory.
     * The name of every entry is printed to standard output before it is loaded.
     *
     * @param zip - ZIP stream positioned before its first entry; it is not closed here
     * @return trimmed lines of every entry, keyed by entry name
     * @throws IOException if the stream cannot be read; a ZipException signals a malformed ZIP
     */
    private static Map<String, ArrayList<String>> loadContents(ZipInputStream zip) throws IOException
    {
        Map<String, ArrayList<String>> map = new HashMap<String, ArrayList<String>>();
        ZipEntry entry;
        while((entry = zip.getNextEntry()) != null) {
            System.out.println("Will load contents of: " + entry.getName());
            map.put(entry.getName(), loadLines(zip));
        }
        return map;
    }

    /**
     * Reads the given stream to its end and returns its lines with leading and
     * trailing whitespace removed.
     * NOTE(review): the bytes are decoded with the platform default charset, so labels
     * containing non-ASCII characters may be read differently on different machines.
     *
     * @param fileStream - stream to read; it is not closed here
     * @return lines of the stream, in order
     * @throws IOException if the stream cannot be read
     */
    private static ArrayList<String> loadLines(InputStream fileStream) throws IOException
    {
        // The reader is deliberately left open: closing it would close the underlying
        // ZIP stream and make the remaining entries unreadable.
        BufferedReader reader = new BufferedReader(new InputStreamReader(fileStream));
        ArrayList<String> lines = new ArrayList<String>();
        String line;
        while((line = reader.readLine()) != null)
            lines.add(line.trim());     // leading and trailing whitespaces are omitted
        return lines;
    }

    /**
     * Closes the stream if it has been opened, ignoring any failure.
     * Used only for streams that were read, where a failed close loses no data.
     *
     * @param stream - stream to close, may be null
     */
    private static void closeQuietly(InputStream stream)
    {
        if(stream == null) return;
        try {
            stream.close();
        }
        catch(IOException ignored) {
            // Nothing useful can be done here, and throwing from the finally clause
            // would hide the exception that is already on its way to the caller.
        }
    }
}