%last updated: 11/01/2010 %Copyright: Kevin Bleakley 2010. %Purpose: script to do drug-target interaction prediction. Uses 10 fold cross-validation. %Requires: the libsvm package. [GIVE URL!] %The actual datasets we used in the correct format are downloadable %from the same page you got this file. So you can see them run before trying to use your own data. %Output: gives two continuously-valued scores for each putative drug-target interaction. Also, %we calculate ROC curves and Area under them (AUC). %General remarks: %1) The code should work on similarity matrices also, and not just positive semi-definite % kernel matrices. %2) You have to have pre-calculated your similarity/kernel matrices in advance. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %here we load the dataset(s) and, as they are textfiles converted from a format needing to be read by %"R", they have one extra column to be removed before proceeding. You have to comment or uncomment the %data set that you want to use. % %1) ENZYMES % %adjacency matrix: % load e_admat_dgc2.txt % y = e_admat_dgc2(:,2:(size(e_admat_dgc2,2))); % %compound similarity matrix: % load e_simmat_dc2.txt % kCompound = e_simmat_dc2(:,2:(size(e_simmat_dc2,2))); % %target similarity matrix: % load e_simmat_dg2.txt % kTarget = e_simmat_dg2(:,2:(size(e_simmat_dg2,2))); % %2) ION CHANNELS % %adjacency matrix: % load ic_admat_dgc2.txt % y = ic_admat_dgc2(:,2:(size(ic_admat_dgc2,2))); % %compound similarity matrix: % load ic_simmat_dc2.txt % kCompound = ic_simmat_dc2(:,2:(size(ic_simmat_dc2,2))); % %target similarity matrix: % load ic_simmat_dg2.txt % kTarget = ic_simmat_dg2(:,2:(size(ic_simmat_dg2,2))); % %3)GPCRs % %adjacency matrix: % load gpcr_admat_dgc2.txt % y = gpcr_admat_dgc2(:,2:(size(gpcr_admat_dgc2,2))); % %compound similarity matrix: % load gpcr_simmat_dc2.txt % kCompound = gpcr_simmat_dc2(:,2:(size(gpcr_simmat_dc2,2))); % %target similarity matrix: % load gpcr_simmat_dg2.txt % kTarget = gpcr_simmat_dg2(:,2:(size(gpcr_simmat_dg2,2))); %4)NUCLEAR RECEPTORS %adjacency matrix: load nr_admat_dgc2.txt y = nr_admat_dgc2(:,2:(size(nr_admat_dgc2,2))); %compound similarity matrix: load nr_simmat_dc2.txt kCompound = nr_simmat_dc2(:,2:(size(nr_simmat_dc2,2))); %target similarity matrix: load nr_simmat_dg2.txt kTarget = nr_simmat_dg2(:,2:(size(nr_simmat_dg2,2))); %We had problems with kCompound having imaginary eigenvalues. Here, we symmetrize kCompound %before continuing: kCompound = (kCompound + kCompound')/2; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Checking for positive semi-definite. This is because, at present, these are "similarity" matrices rather %than "kernel" matrices. For now, we just add increments of a small epsilon to the diagonal, until the %matrix becomes positive semi-definite: epsilon = .1; while sum(eig(kCompound) >= 0) < size(kCompound,1) | isreal(eig(kCompound))==0 kCompound = kCompound + epsilon*eye(size(kCompound,1)); end while sum(eig(kTarget) >= 0) < size(kTarget,1) | isreal(eig(kTarget))==0 kTarget = kTarget + epsilon*eye(size(kTarget,1)); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %intialise some variables: lengthKCompound = size(kCompound,1); myPredictions = zeros(lengthKCompound,size(y,1)); lengthKTarget = size(kTarget,1); myPredictions2 = zeros(lengthKTarget,size(y,2)); numbEdges1 = zeros(lengthKCompound,size(y,1)); numbEdges2 = zeros(lengthKTarget,size(y,2)); %randomly permute the order of the drugs and compounds.: firstRP = randperm(size(kCompound,1)); secondRP = randperm(size(kTarget,1)); kCompound = kCompound(firstRP,firstRP); kTarget = kTarget(secondRP,secondRP); y = y(secondRP,firstRP); %and we have to save the 'inverse' of these permutations: [myback1 mb1] = sort(firstRP); [myback2 mb2] = sort(secondRP); %define the number of cross-validation splits: numOfSplits = 10; theseIndices = zeros(numOfSplits,2); thebasic = floor(size(y,1)/numOfSplits); theseIndices(1,1) = 1; for gg = 1:(numOfSplits-1) theseIndices(gg,2) = thebasic*gg; theseIndices(gg+1,1) = thebasic*gg + 1; end theseIndices(numOfSplits,2) = size(y,1); theseIndices2 = zeros(numOfSplits,2); thebasic2 = floor(size(y,2)/numOfSplits); theseIndices2(1,1) = 1; for gg = 1:(numOfSplits-1) theseIndices2(gg,2) = thebasic2*gg; theseIndices2(gg+1,1) = thebasic2*gg + 1; end theseIndices2(numOfSplits,2) = size(y,2); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Fixing each target in turn, then doing ten fold cross validation %on the set of compounds, to predict which ones %are/are not targeting the target in question: for i=1:size(y,1); i pause(.3) currentY = y(i,:)'; for j = 1:lengthKCompound %%%!%%% whatSplit = sum(j >= theseIndices2(:,1)); %%%!%%% %j; if sum(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1) > 0 %these are the weights for positive and negative examples. They alter the behavior of libsvm. m2a = 1; m2b = 1; %defining the train and test matrices that will be input into libsvm. %%%!%%% K1 = [[1:(lengthKCompound-(theseIndices2(whatSplit,2)-theseIndices2(whatSplit,1)+1))]' kCompound(setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]),setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]))]; K2 = [(j)' kCompound(j,setdiff(1:lengthKCompound,[theseIndices2(whatSplit,1):theseIndices2(whatSplit,2)]))]; %%%!%%% %training: model = svmtrain(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))),K1,strcat(['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)])); %testing: [predict_label,accuracy,dec_values] = svmpredict(currentY(j),K2, model); %%!%% %this line is to fix a weakness in libsvm, the first training label is always automatically %labeled +1, even if it's -1 ! firstLabel = currentY(1)*(j>theseIndices2(1,2)) + currentY(theseIndices2(2,1))*(j<=theseIndices2(1,2)); %%!%% myPredictions(j,i) = dec_values*sign(firstLabel - 1/2); numbEdges1(j,i) = sum(currentY(setdiff(1:lengthKCompound,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1); else myPredictions(j,i) = -1; numbEdges1(j,i) = 0; end end end %un-permuting the row and column orders: myPredictions = myPredictions(mb1,mb2); ytemp = y'; ytemp = ytemp(mb1,mb2); ytemp = ytemp'; %Turning myPredictions and "y" into vectors: myScores =[]; myTrueLabels = []; for k = 1:lengthKCompound myScores = [myScores myPredictions(k,:)]; myTrueLabels = [myTrueLabels ytemp(:,k)']; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Fixing each compound in turn, then doing ten fold cross validation %on the set of targets, to predict which ones %are/are not targeted by the compound in question: for i= 1:size(y,2); i; currentY = y(:,i); for j = 1:lengthKTarget %%%!%%% whatSplit = sum(j >= theseIndices(:,1)); %%%!%%% if sum(currentY(setdiff(1:lengthKTarget,theseIndices(whatSplit,1):theseIndices(whatSplit,2))) == 1) > 0 %these are the weights for positive and negative examples. They alter the behavior of %libsvm. m2a = 1; m2b = 1; %matrices to input to libsvm: K1 = [[1:(lengthKTarget-(theseIndices(whatSplit,2)-theseIndices(whatSplit,1)+1))]' kTarget(setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]),setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]))]; K2 = [(j)' kTarget(j,setdiff(1:lengthKTarget,[theseIndices(whatSplit,1):theseIndices(whatSplit,2)]))]; %training: model = svmtrain(currentY(setdiff(1:lengthKTarget,theseIndices(whatSplit,1):theseIndices(whatSplit,2))),K1,strcat(['-t 4 -c 1 -w1 ',num2str(m2a),' -w-1 ',num2str(m2b)])); %testing: [predict_label,accuracy,dec_values] = svmpredict(currentY(j),K2, model); %fixing libsvms desire to label -1 examples as +1 examples if they are the first item in the %training set. firstLabel = currentY(1)*(j>theseIndices(1,2)) + currentY(theseIndices(2,1))*(j<=theseIndices(1,2)); myPredictions2(j,i) = dec_values*sign(firstLabel - 1/2); numbEdges2(j,i) = sum(currentY(setdiff(1:lengthKTarget,theseIndices2(whatSplit,1):theseIndices2(whatSplit,2))) == 1); else myPredictions2(j,i) = -1; numbEdges2(j,i) = 0; end end end %un-permuting the row and column orders: myPredictions2 = myPredictions2'; myPredictions2 = myPredictions2(mb1,mb2); myPredictions2 = myPredictions2'; y = y(mb2,mb1); %Turning myPredictions and "y" into vectors: myScores2 =[]; myTrueLabels2 = []; for k = 1:lengthKTarget myScores2 = [myScores2 myPredictions2(k,:)]; myTrueLabels2 = [myTrueLabels2 y(k,:)]; end %and again, but this time by column, (for later): myScores2b =[]; myTrueLabels2b = []; for k = 1:lengthKCompound myScores2b = [myScores2b myPredictions2(:,k)']; myTrueLabels2b = [myTrueLabels2b y(:,k)']; end % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %AREA UNDER ROC CURVE for fixing targets in turn: myNumb = length((-2.5):.001:(2.5)); TruPos = zeros(1,myNumb); FalPos = zeros(1,myNumb); countMan = 0; for moveit = (-2.5):.001:(2.5) countMan = countMan + 1; TruPos(countMan) = sum(sign(myScores + moveit)==1 & myTrueLabels==1); FalPos(countMan) = sum(sign(myScores + moveit)==1 & myTrueLabels==0); end plot(FalPos/max(FalPos),TruPos/max(TruPos),'r'); hold; % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % %calculating the area under the ROC curve: % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% xx = FalPos/max(FalPos); yy = TruPos/max(TruPos); areaUnderROC = 0; old = [0 0]; for bb = 1:myNumb new = [xx(bb) yy(bb)]; if new(1) == old(1) & new(2) > old(2) old = new; elseif new(1) > old(1) & new(2) > old(2) areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1)); old = new; elseif new(1) > old(1) & new(2) == old(2) areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1)); old = new; end end areaUnderROC % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % %AREA UNDER ROC CURVE for fixing compounds in turn: myNumb = length((-2.5):.001:(2.5)); TruPos2 = zeros(1,myNumb); FalPos2 = zeros(1,myNumb); countMan = 0; for moveit = (-2.5):.001:(2.5) countMan = countMan + 1; TruPos2(countMan) = sum(sign(myScores2 + moveit)==1 & myTrueLabels2==1); FalPos2(countMan) = sum(sign(myScores2 + moveit)==1 & myTrueLabels2==0); end plot(FalPos2/max(FalPos2),TruPos2/max(TruPos2),'b'); % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % %calculating the area under the ROC curve: % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% xx = FalPos2/max(FalPos2); yy = TruPos2/max(TruPos2); areaUnderROC = 0; old = [0 0]; for bb = 1:myNumb new = [xx(bb) yy(bb)]; if new(1) == old(1) & new(2) > old(2) old = new; elseif new(1) > old(1) & new(2) > old(2) areaUnderROC = areaUnderROC + ((old(2) + new(2))/2) * (new(1) - old(1)); old = new; elseif new(1) > old(1) & new(2) == old(2) areaUnderROC = areaUnderROC + old(2) * (new(1) - old(1)); old = new; end end areaUnderROC % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % %AREA UNDER ROC CURVE for both situations added together: %various ways of combining the two predictions made for each edge: %1) adding them: %myPredictions3 = myPredictions + myPredictions2'; %2) taking the largest: myPredictions3 = max(myPredictions,myPredictions2'); myScores3 =[]; myTrueLabels3 = []; for k = 1:lengthKCompound myScores3 = [myScores3 myPredictions3(k,:)]; myTrueLabels3 = [myTrueLabels3 y(:,k)']; end myNumb = length((-5):.001:(5)); TruPos3 = zeros(1,myNumb); FalPos3 = zeros(1,myNumb); countMan = 0; for moveit = (-5):.001:(5) countMan = countMan + 1; TruPos3(countMan) = sum(sign(myScores3 + moveit)==1 & myTrueLabels3==1); FalPos3(countMan) = sum(sign(myScores3 + moveit)==1 & myTrueLabels3==0); end plot(FalPos3/max(FalPos3),TruPos3/max(TruPos3),'g'); hold off; % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % %calculating the area under the ROC curve: % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% xx3 = FalPos3/max(FalPos3); yy3 = TruPos3/max(TruPos3); areaUnderROC3 = 0; old3 = [0 0]; for bb = 1:length(xx3) new3 = [xx3(bb) yy3(bb)]; if new3(1) == old3(1) & new3(2) > old3(2) old3 = new3; elseif new3(1) > old3(1) & new3(2) > old3(2) areaUnderROC3 = areaUnderROC3 + ((old3(2) + new3(2))/2) * (new3(1) - old3(1)); old3 = new3; elseif new3(1) > old3(1) & new3(2) == old3(2) areaUnderROC3 = areaUnderROC3 + old3(2) * (new3(1) - old3(1)); old3 = new3; end end areaUnderROC3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %here we calculate area under precision-recall curve, only for the 3rd "max" data set. %by copying and pasting and changing all the '3' to '1' or '2', %you can do this for the TruePos1 etc., and TruePos2 etc. %FDR curves and/or precision-recall curves: xFDRPR = TruPos3/max(TruPos3); yFDR = FalPos3./(FalPos3 + TruPos3); yPR = 1 - yFDR; plot(xFDRPR,yFDR); plot(xFDRPR,yPR); %area under precision accuracy curve: (actually it's a code for area under the FDR curve, so %in the last line we do "1 - result". xxPR = TruPos3/max(TruPos3); yyPR = FalPos3./(FalPos3 + TruPos3 + .000000001*(FalPos3==0 & TruPos3==0)); xRun = 0; addToSum = 0; myXa = 0; for myCounta = 1:myNumb if xxPR(myCounta) > xRun & myCounta > 1 myXb = xxPR(myCounta); addToSum = addToSum + (myXb - myXa)*(yyPR(myCounta - 1) + yyPR(myCounta))/2; myXa = myXb; xRun = xxPR(myCounta); elseif xxPR(myCounta) > xRun & myCounta == 1 myXa = xxPR(1); xRun = xxPR(myCounta); end end AUPR = 1 - addToSum %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%