cacc.m

function [ discdata,discvalues,discscheme ] = cacc(data)
    
    discdata = zeros(size(data));
    discvalues = cell([size(data,2) 1]);
    discscheme = cell([size(data,2)-1 1]);
    
    % s is the number of target classes of S (possible class values)
    %%% unique values in all rows of 5th column
    s = unique(data(:,size(data,2)));
    
    % assume the maximum number of intervals is M*0.75 (there is no point in 
    % using an algorithm to discretize a dataset if the resulting discretized 
    % dataset contains a lot of possible values/intervals)
    maxnumintervals = floor(size(data,1)*0.75);
    
    % following the pseudo-code of the paper:
    % for each continuous attribute Ai
    for A = 1:size(data,2)-1 % (-1 because we are not discretizing the class)
        
        % find the maximum dn and the minimum d0 values of Ai
        d0 = min(data(:,A));
        dn = max(data(:,A));
        
        % form a set of all distinct values of A in ascending order
        distincvaluesA = sort(unique(data(:,A)));
        
        % calculate the midpoints of all the adjacent pairs in the set
        B = (distincvaluesA(1:length(distincvaluesA)-1)+distincvaluesA(2:length(distincvaluesA)))/2;
        
        % set the initial discretization scheme as D: {[d0,dn]} and globalcacc = 0;
        D = [d0 dn];
        globalcacc = 0;
        
        % initialize k = 1 (well... = 0 in this code), this is for helping 
        % the algorithm to stop once we have reached the maximum number of 
        % intervals
        k = 0;
        
        %for each inner boundary B which is not already in scheme D, Add it into D
        %   calculate the corresponding cacc value
        %   pick up the scheme D' with the highest cacc value:        
        for i=1:length(B)
            auxB = B;
            maxcacc = 0;
            while length(auxB) > 0
                if(auxB(1) == d0)                    
                    auxB(1) = [];
                    continue;
                end
                % add the boundary B which is not already in scheme D
                D = unique(sort([D, auxB(1)]));
                
                % calculate cacc value s={1 2 3}
                caccval = caccvalue(data(:,A), D, s, data(:,size(data,2)));
                
                % print acc value
                %caccval 
                
                % pick up the scheme D' with the highest cacc value
                if(caccval > maxcacc)
                    Dprime = D;
                    maxcacc = caccval;
                    toremove = auxB(1);
                end
                % remove the boundary B (since we already tried with it)
                D=D(D~=auxB(1));
                auxB(1) = [];
            end
            
            % if cacc > globalcacc
            %    replace D with D', globalcacc = cacc:
            if maxcacc > globalcacc
                B = B(B~=toremove);
                D = Dprime;
                globalcacc = maxcacc;
                k = k + 1;
                if  k > maxnumintervals % if we have reached the maximun number 
                    break;              % of intervals we stop and continue with 
                end                     % the next attribute Ai+1
            end
        end
        
        % output the discretization scheme D' and discretized 
        % data with k intervals for continuous attribute Ai
        discscheme{A} = D;
        discdata(:,A) = discretizedata(data(:,A),D);
        discvalues{A} = unique(discdata(:,A));
    end
    discdata(:,size(data,2)) = data(:,size(data,2));
    discvalues{size(data,2)} = unique(discdata(:,size(data,2)));
end
function caccval = caccvalue(data, discscheme, s, c)
    M = size(data,1);
    
    % discretize the continuous data and compute the quanta matrix:
    
    % I decided not to call the 'discretizedata' function (at the end of this file)
    % and then compute the quanta matrix since that would require two for-
    % loops iterating over each instance on the dataset each one (one for 
    % discretizing the data in the 'discretizedata' function and another 
    % one for computing the quanta matrix).
    % I discovered that I could use only one for-loop to discretize the data and compute
    % the quanta matrix at the same time as follows:    
    discretizeddata = zeros(size(data,1),1);
    quantamatrix = zeros(length(s),length(discscheme)-1);
    for i = 1:size(data,1)
        for t = 2:length(discscheme)
           if data(i) <= discscheme(t)
               discretizeddata(i) = t-1; % discretize data
               break;
           end
        end
        % compute quanta matrix
        quantamatrix(c(i),discretizeddata(i)) = quantamatrix(c(i),discretizeddata(i)) + 1;
    end
    
    % compute y value by using the quanta matrix:
    y = 0;
    rowquantamatrix = sum(quantamatrix,2);
    columnquantamatrix = sum(quantamatrix,1);    
    for p = 1:length(s)
        for q = 1:length(discscheme)-1
           if rowquantamatrix(p) > 0 && columnquantamatrix(q) > 0
              y = y + (quantamatrix(p,q))^2 / (rowquantamatrix(p)*columnquantamatrix(q));
           end
        end
    end
    
    % compute y' value from y value:
    yprime = M*(y-1)/log(length(discscheme)-1);
    
    % compute cacc value from y' value:
    caccval = sqrt(yprime/(yprime+M));
end
function discdata = discretizedata(data, discscheme) 

    % discretize the continuous data upon the 
    % discretization scheme, 
    % this function is called at the end of the cacc algorithm, when we
    % already have the 'final' discretization scheme
    discdata = zeros(size(data,1),1);
    for p = 1:size(data,1)
        for t = 2:length(discscheme)
           if data(p) <= discscheme(t)
               discdata(p) = t-1;
               break;
           end
        end
    end
      
    % normalize discrete data
    % i.e., if the discretized data 
    % for attribute Ai is e.g.: 1,9,13,15, 
    % 'normalize' these discrete values as 1,2,3,4:
    % (you can comment the next 4 lines of code if you
    % don't want this functionality):
    %normvalues = sort(unique(discdata));
    %for i = 1:length(normvalues)
    %    discdata(find(discdata==normvalues(i))) = i;
    %end
end