%last updated: 11/01/2010
%Copyright: Kevin Bleakley 2010.

%Purpose: script to do drug-target interaction prediction. Uses 10 fold cross-validation.

%Requires: the libsvm package. [GIVE URL!]

%The actual datasets we used in the correct format are downloadable 
%from the same page you got this file. So you can see them run before trying to use your own data.


%Output: gives two continuously-valued scores for each putative drug-target interaction. Also,
         %we calculate ROC curves and Area under them (AUC).

%General remarks:
%1) The code should work on similarity matrices also, and not just positive semi-definite
%   kernel matrices.
%2) You have to have pre-calculated your similarity/kernel matrices in advance.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%here we load the dataset(s) and, as they are textfiles converted from a format needing to be read by
%"R", they have one extra column to be removed before proceeding. You have to comment or uncomment the
%data set that you want to use.

% %1) ENZYMES
% %adjacency matrix:
% load e_admat_dgc2.txt
% y = e_admat_dgc2(:,2:(size(e_admat_dgc2,2)));
% %compound similarity matrix:
% load e_simmat_dc2.txt
% kCompound = e_simmat_dc2(:,2:(size(e_simmat_dc2,2)));
% %target similarity matrix:
% load e_simmat_dg2.txt
% kTarget = e_simmat_dg2(:,2:(size(e_simmat_dg2,2)));

% %2) ION CHANNELS
% %adjacency matrix:
% load ic_admat_dgc2.txt
% y = ic_admat_dgc2(:,2:(size(ic_admat_dgc2,2)));
% %compound similarity matrix:
% load ic_simmat_dc2.txt
% kCompound = ic_simmat_dc2(:,2:(size(ic_simmat_dc2,2)));
% %target similarity matrix:
% load ic_simmat_dg2.txt
% kTarget = ic_simmat_dg2(:,2:(size(ic_simmat_dg2,2)));

% %3)GPCRs
% %adjacency matrix:
% load gpcr_admat_dgc2.txt
% y = gpcr_admat_dgc2(:,2:(size(gpcr_admat_dgc2,2)));
% %compound similarity matrix:
% load gpcr_simmat_dc2.txt
% kCompound = gpcr_simmat_dc2(:,2:(size(gpcr_simmat_dc2,2)));
% %target similarity matrix:
% load gpcr_simmat_dg2.txt
% kTarget = gpcr_simmat_dg2(:,2:(size(gpcr_simmat_dg2,2)));

%4)NUCLEAR RECEPTORS
%adjacency matrix:
load nr_admat_dgc2.txt
y = nr_admat_dgc2(:,2:(size(nr_admat_dgc2,2)));
%compound similarity matrix:
load nr_simmat_dc2.txt
kCompound = nr_simmat_dc2(:,2:(size(nr_simmat_dc2,2)));
%target similarity matrix:
load nr_simmat_dg2.txt
kTarget = nr_simmat_dg2(:,2:(size(nr_simmat_dg2,2)));

%We had problems with kCompound having imaginary eigenvalues. Here, we symmetrize kCompound
%before continuing:
kCompound = (kCompound + kCompound')/2;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Checking for positive semi-definite. This is because, at present, these are "similarity" matrices rather
%than "kernel" matrices. For now, we just add increments of a small epsilon to the diagonal, until the
%matrix becomes positive semi-definite:
epsilon = .1;
while sum(eig(kCompound) >= 0) < size(kCompound,1) | isreal(eig(kCompound))==0 
    kCompound = kCompound + epsilon*eye(size(kCompound,1));
end
while sum(eig(kTarget) >= 0) < size(kTarget,1) | isreal(eig(kTarget))==0 
    kTarget = kTarget + epsilon*eye(size(kTarget,1));
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%intialise some variables:
lengthKCompound = size(kCompound,1); 
myPredictions = zeros(lengthKCompound,size(y,1));
lengthKTarget = size(kTarget,1); 
myPredictions2 = zeros(lengthKTarget,size(y,2));
numbEdges1 = zeros(lengthKCompound,size(y,1));
numbEdges2 = zeros(lengthKTarget,size(y,2));


%randomly permute the order of the drugs and compounds.:
firstRP = randperm(size(kCompound,1));
secondRP = randperm(size(kTarget,1));
kCompound = kCompound(firstRP,firstRP);
kTarget = kTarget(secondRP,secondRP);
y = y(secondRP,firstRP);

%and we have to save the 'inverse' of these permutations:
[myback1 mb1] = sort(firstRP);
[myback2 mb2] = sort(secondRP);

%define the number of cross-validation splits:
numOfSplits = 10;

theseIndices = zeros(numOfSplits,2);
thebasic = floor(size(y,1)/numOfSplits);
theseIndices(1,1) = 1;
for gg = 1:(numOfSplits-1)
    theseIndices(gg,2) = thebasic*gg;
    theseIndices(gg+1,1) = thebasic*gg + 1;
end
theseIndices(numOfSplits,2) = size(y,1);


theseIndices2 = zeros(numOfSplits,2);
thebasic2 = floor(size(y,2)/numOfSplits);
theseIndices2(1,1) = 1;
for gg = 1:(numOfSplits-1)
    theseIndices2(gg,2) = thebasic2*gg;
    theseIndices2(gg+1,1) = thebasic2*gg + 1;
end
theseIndices2(numOfSplits,2) = size(y,2);


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Fixing each target in turn, then doing ten fold cross validation 
%on the set of compounds, to predict which ones
%are/are not targeting the target in question:
for i=1:size(y,1);
    i
    pause(.3)
    currentY = y(i,:)';
    for j = 1:lengthKCompound
        %%%!%%%
        whatSplit = sum(j >= theseIndices2(:,1));
        %%%!%%%
        %j;
        if sum(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1) > 0
            
            %these are the weights for positive and negative examples. They alter the behavior of libsvm.
            m2a = 1;
            m2b = 1;

            %defining the train and test matrices that will be input into libsvm.
            %%%!%%%
            K1 = [[1:(lengthKCompound-(theseIndices2(whatSplit,2)-theseIndices2(whatSplit,1)+1))]' kCompound(setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]),setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]))];
            K2 = [(j)' kCompound(j,setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]))];
            %%%!%%%
            
            %training:
            model = svmtrain(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))),K1,strcat(['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)]));
            %testing:
            [predict_label,accuracy,dec_values] = svmpredict(currentY(j),K2, model);
            %%!%%
            %this line is to fix a weakness in libsvm, the first training label is always automatically
            %labeled +1, even if it's -1 !
            firstLabel = currentY(1)*(j>theseIndices2(1,2)) + currentY(theseIndices2(2,1))*(j<=theseIndices2(1,2));
            %%!%%
            myPredictions(j,i) = dec_values*sign(firstLabel - 1/2);
            numbEdges1(j,i) = sum(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1);
        else
            myPredictions(j,i) = -1; 
            numbEdges1(j,i) = 0;
        end
    end
end

%un-permuting the row and column orders:
myPredictions = myPredictions(mb1,mb2);
ytemp = y';
ytemp = ytemp(mb1,mb2);
ytemp = ytemp';

%Turning myPredictions and "y" into vectors:
myScores =[];
myTrueLabels = [];
for k = 1:lengthKCompound
    myScores = [myScores myPredictions(k,:)];
    myTrueLabels = [myTrueLabels ytemp(:,k)'];
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Fixing each compound in turn, then doing ten fold cross validation 
%on the set of targets, to predict which ones
%are/are not targeted by the compound in question:
for i= 1:size(y,2);
    i;
    currentY = y(:,i);
    for j = 1:lengthKTarget
        %%%!%%%
        whatSplit = sum(j >= theseIndices(:,1));
        %%%!%%%
        if sum(currentY(setdiff(1:lengthKTarget,theseIndices(whatSplit,1):theseIndices(whatSplit,2))) == 1) > 0

              %these are the weights for positive and negative examples. They alter the behavior of
              %libsvm.
              m2a = 1;
              m2b = 1;
          %matrices to input to libsvm:
          K1 = [[1:(lengthKTarget-(theseIndices(whatSplit,2)-theseIndices(whatSplit,1)+1))]' kTarget(setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]),setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]))];
          K2 = [(j)' kTarget(j,setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]))];
          %training:
          model = svmtrain(currentY(setdiff(1:lengthKTarget,theseIndices(whatSplit,1):theseIndices(whatSplit,2))),K1,strcat(['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)]));
          %testing:
          [predict_label,accuracy,dec_values] = svmpredict(currentY(j),K2, model);
          %fixing libsvms desire to label -1 examples as +1 examples if they are the first item in the
          %training set.
          firstLabel = currentY(1)*(j>theseIndices(1,2)) + currentY(theseIndices(2,1))*(j<=theseIndices(1,2));
          myPredictions2(j,i) = dec_values*sign(firstLabel - 1/2); 
          numbEdges2(j,i) = sum(currentY(setdiff(1:lengthKTarget,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1);

        else
            myPredictions2(j,i) = -1;
            numbEdges2(j,i) = 0;
        end
    end
end

%un-permuting the row and column orders:
myPredictions2 = myPredictions2';
myPredictions2 = myPredictions2(mb1,mb2);
myPredictions2 = myPredictions2';
y = y(mb2,mb1);

%Turning myPredictions and "y" into vectors:
myScores2 =[];
myTrueLabels2 = [];
for k = 1:lengthKTarget
    myScores2 = [myScores2 myPredictions2(k,:)];
    myTrueLabels2 = [myTrueLabels2 y(k,:)];
end

%and again, but this time by column, (for later):
myScores2b =[];
myTrueLabels2b = [];
for k = 1:lengthKCompound
    myScores2b = [myScores2b myPredictions2(:,k)'];
    myTrueLabels2b = [myTrueLabels2b y(:,k)'];
end

 
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%AREA UNDER ROC CURVE for fixing targets in turn:
myNumb = length((-2.5):.001:(2.5));

TruPos = zeros(1,myNumb);
FalPos = zeros(1,myNumb);

countMan = 0;
for moveit = (-2.5):.001:(2.5)
        countMan = countMan + 1;
        TruPos(countMan) = sum(sign(myScores + moveit)==1 & myTrueLabels==1);
        FalPos(countMan) = sum(sign(myScores + moveit)==1 & myTrueLabels==0);
end

plot(FalPos/max(FalPos),TruPos/max(TruPos),'r');
hold;

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

xx = FalPos/max(FalPos);
yy = TruPos/max(TruPos);

areaUnderROC = 0;

old = [0 0];

for bb = 1:myNumb
    new = [xx(bb) yy(bb)];
   
    if new(1) == old(1) & new(2) > old(2)
        old = new;
       
    elseif new(1) > old(1) & new(2) > old(2)
        areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1));
        old = new;
       
    elseif new(1) > old(1) & new(2) == old(2)
        areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1));
        old = new;
       
    end
end

areaUnderROC


% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %AREA UNDER ROC CURVE for fixing compounds in turn:
myNumb = length((-2.5):.001:(2.5));

TruPos2 = zeros(1,myNumb);
FalPos2 = zeros(1,myNumb);

countMan = 0;
for moveit = (-2.5):.001:(2.5)
        countMan = countMan + 1;
        TruPos2(countMan) = sum(sign(myScores2 + moveit)==1 & myTrueLabels2==1);
        FalPos2(countMan) = sum(sign(myScores2 + moveit)==1 & myTrueLabels2==0);
end

plot(FalPos2/max(FalPos2),TruPos2/max(TruPos2),'b');

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

xx = FalPos2/max(FalPos2);
yy = TruPos2/max(TruPos2);

areaUnderROC = 0;

old = [0 0];

for bb = 1:myNumb
    new = [xx(bb) yy(bb)];
   
    if new(1) == old(1) & new(2) > old(2)
        old = new;
       
    elseif new(1) > old(1) & new(2) > old(2)
        areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1));
        old = new;
       
    elseif new(1) > old(1) & new(2) == old(2)
        areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1));
        old = new;
       
    end
end

areaUnderROC

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %AREA UNDER ROC CURVE for both situations added together:

%various ways of combining the two predictions made for each edge:
%1) adding them:
%myPredictions3 = myPredictions + myPredictions2';
%2) taking the largest:
myPredictions3 = max(myPredictions,myPredictions2');

myScores3 =[];
myTrueLabels3 = [];
for k = 1:lengthKCompound
    myScores3 = [myScores3 myPredictions3(k,:)];
    myTrueLabels3 = [myTrueLabels3 y(:,k)'];
end

myNumb = length((-5):.001:(5));

TruPos3 = zeros(1,myNumb);
FalPos3 = zeros(1,myNumb);

countMan = 0;
for moveit = (-5):.001:(5)
        countMan = countMan + 1;
        TruPos3(countMan) = sum(sign(myScores3 + moveit)==1 & myTrueLabels3==1);
        FalPos3(countMan) = sum(sign(myScores3 + moveit)==1 & myTrueLabels3==0);
end

plot(FalPos3/max(FalPos3),TruPos3/max(TruPos3),'g');
hold off;

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %calculating the area under the ROC curve:
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

xx3 = FalPos3/max(FalPos3);
yy3 = TruPos3/max(TruPos3);

areaUnderROC3 = 0;

old3 = [0 0];

for bb = 1:length(xx3)
    new3 = [xx3(bb) yy3(bb)];
   
    if new3(1) == old3(1) & new3(2) > old3(2)
        old3 = new3;
       
    elseif new3(1) > old3(1) & new3(2) > old3(2)
        areaUnderROC3 = areaUnderROC3 + ((old3(2) + new3(2))/2) * (new3(1) - old3(1));
        old3 = new3;
       
    elseif new3(1) > old3(1) & new3(2) == old3(2)
        areaUnderROC3 = areaUnderROC3 + old3(2) * (new3(1) - old3(1));
        old3 = new3;
       
    end
end

areaUnderROC3

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%here we calculate area under precision-recall curve, only for the 3rd "max" data set.
%by copying and pasting and changing all the '3' to '1' or '2', 
%you can do this for the TruePos1 etc., and TruePos2 etc.


%FDR curves and/or precision-recall curves:
xFDRPR = TruPos3/max(TruPos3);
yFDR = FalPos3./(FalPos3 + TruPos3);
yPR = 1 - yFDR;
plot(xFDRPR,yFDR);
plot(xFDRPR,yPR);

%area under precision accuracy curve: (actually it's a code for area under the FDR curve, so
%in the last line we do "1 - result".
            xxPR = TruPos3/max(TruPos3);
            yyPR = FalPos3./(FalPos3 + TruPos3 + .000000001*(FalPos3==0 & TruPos3==0));
            xRun = 0;
            addToSum = 0;
            myXa = 0;
                for myCounta = 1:myNumb
                    if xxPR(myCounta) > xRun & myCounta > 1
                        myXb = xxPR(myCounta);
                        addToSum = addToSum + (myXb - myXa)*(yyPR(myCounta - 1) + yyPR(myCounta))/2;
                        myXa = myXb;
                        xRun = xxPR(myCounta);
                    elseif xxPR(myCounta) > xRun & myCounta == 1
                        myXa = xxPR(1);
                        xRun = xxPR(myCounta);
                    end
                end
            AUPR = 1 - addToSum
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%