machine-learning Getting started: Loading a dataset from file


Example

The Iris flower data set is a widely used data set for demonstration purposes. We will load it, inspect it and slightly modify it for later use.

import java.io.File;
import java.net.URL;
import weka.core.Instances;
import weka.core.converters.ArffSaver;
import weka.core.converters.CSVLoader;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.RenameAttribute;
import weka.classifiers.evaluation.Evaluation;
import weka.classifiers.rules.ZeroR;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.lazy.IBk;
import weka.classifiers.trees.J48;
import weka.classifiers.meta.AdaBoostM1;

public class IrisExperiments {
    public static void main(String args[]) throws Exception
     {
        //First we open stream to a data set as provided on http://archive.ics.uci.edu
         CSVLoader loader = new CSVLoader();
         loader.setSource(new URL("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data").openStream());
         Instances data = loader.getDataSet();
         
         //This file has 149 examples with 5 attributes
         //In order:
         //  sepal length in cm
         //  sepal width in cm
         //  petal length in cm 
         //  petal width in cm
         //  class ( Iris Setosa , Iris Versicolour, Iris Virginica)

         //Let's briefly inspect the data
         System.out.println("This file has " + data.numInstances()+" examples.");
         System.out.println("The first example looks like this: ");
         for(int i = 0; i < data.instance(0).numAttributes();i++ ){
             System.out.println(data.instance(0).attribute(i));
         } 
         
         // NOTE that the last attribute is Nominal
         // It is convention to have a nominal variable at the last index as target variable
         
         // Let's tidy up the data a little bit
         // Nothing too serious just to show how we can manipulate the data with filters
         RenameAttribute renamer = new RenameAttribute();
         renamer.setOptions(weka.core.Utils.splitOptions("-R last -replace Iris-type"));
         renamer.setInputFormat(data);                          
         data = Filter.useFilter(data, renamer); 
         
         System.out.println("We changed the name of the target class.");
         System.out.println("And now it looks like this:");
         System.out.println(data.instance(0).attribute(4));
         
         //Now we do this for all the attributes
         renamer.setOptions(weka.core.Utils.splitOptions("-R 1 -replace sepal-length"));
         renamer.setInputFormat(data);                          
         data = Filter.useFilter(data, renamer); 
         
         renamer.setOptions(weka.core.Utils.splitOptions("-R 2 -replace sepal-width"));
         renamer.setInputFormat(data);                          
         data = Filter.useFilter(data, renamer); 
         
         renamer.setOptions(weka.core.Utils.splitOptions("-R 3 -replace petal-length"));
         renamer.setInputFormat(data);                          
         data = Filter.useFilter(data, renamer); 
         
         renamer.setOptions(weka.core.Utils.splitOptions("-R 4 -replace petal-width"));
         renamer.setInputFormat(data);                          
         data = Filter.useFilter(data, renamer); 
         
         //Lastly we save our newly created file to disk
         ArffSaver saver = new ArffSaver();
         saver.setInstances(data);
         saver.setFile(new File("IrisSet.arff"));
         saver.writeBatch();
  }
}