RNASeq MATLAB

Da Bioingegneria Elettronica e Informatica.

Nicola Altini nicola.altini@poliba.it

Simona De Summa s.desumma@oncologico.bari.it

Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it

Slides

Example: lncRNAs in prostate cancer in relation to Gleason score

  1. %% lncRNAs in prostate cancer in relation to Gleason score
  2. %%% Read Input Data
  3. path_images_folder = "MATLAB/RNASeq/images/";
  4. % Loading read counts data
  5. filenameGenData = "matrix_seminario.csv";
  6. genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);
  7. % Loading clinical data
  8. filenameClinicalData = "design_gleason.csv";
  9. clinicalData = readtable(filenameClinicalData);
  10.  
  11. % Count Features and Samples
  12. nFeatures = size(genData, 1);
  13. nPatients = size(genData, 2);
  14. fprintf("There are %d features and %d samples\n", nFeatures, nPatients);
  15. <syntaxhighlight/>
  16.  
  17. === Labels Assignment ===
  18. <syntaxhighlight lang="matlab" line>
  19. %%% Labels
  20. % Create label vectors for high/low gleason score patients
  21. labels_low = [];
  22. labels_high = [];
  23. for i = 1:nPatients
  24.     label = clinicalData{i,3}{1};
  25.     if label == "low"
  26.         labels_low = [labels_low, i];
  27.     else
  28.         labels_high = [labels_high, i];
  29.     end
  30. end
  31. nLows = numel(labels_low);
  32. nHighs = numel(labels_high);
  33. fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);
  34. <syntaxhighlight/>
  35.  
  36. === Dimensionality Reduction ===
  37. ==== PCA ====
  38. <syntaxhighlight lang="matlab" line>
  39. %% PCA
  40. genDataArray = table2array(genData);
  41. genDataArrayT = genDataArray';
  42. genDataArrayNormT = log2(genDataArrayT + 1);
  43. [coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);
  44.  
  45. %%% Plot PC1 + PC2
  46. figure
  47. scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');
  48. axis([-0.5 0.5 -0.5 0.5])
  49. hold on 
  50. scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');
  51. <syntaxhighlight/>
  52.  
  53. ==== t-SNE ====
  54. <syntaxhighlight lang="matlab" line>
  55. %% tSNE
  56. nDim = 3;
  57. genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);
  58. figure
  59. if nDim == 2
  60.     scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');
  61.     hold on
  62.     scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');
  63. elseif  nDim == 3
  64.     scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...
  65.              genDataEmbedding(labels_low,3), 3, 'black');
  66.     hold on
  67.     scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...
  68.              genDataEmbedding(labels_high,3), 3, 'red');
  69. end
  70. <syntaxhighlight/>
  71.  
  72. === Normalizing Read Counts ===
  73. <syntaxhighlight lang="matlab" line>
  74. %% Normalizing Read Counts
  75. counts = genDataArray;
  76.  
  77. % estimate pseudo-reference with geometric mean row by row
  78. pseudoRefSample = geomean(counts,2);
  79. nz = pseudoRefSample > 0;
  80. ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));
  81. sizeFactors = median(ratios,1);
  82.  
  83. % transform to common scale
  84. normCounts = bsxfun(@rdivide,counts,sizeFactors);
  85. normCounts(1:10,:);
  86.  
  87. %% Boxplots
  88. figure;
  89.  
  90. subplot(2,1,1)
  91. maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')
  92. ylabel('sample');
  93. xlabel('log2(counts)');
  94.  
  95. subplot(2,1,2)
  96. maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')
  97. ylabel('sample');
  98. xlabel('log2(counts)');
  99. <syntaxhighlight/>
  100.  
  101. === Computing Mean, Dispersion and Fold Change ===
  102. <syntaxhighlight lang="matlab" line>
  103. %% Computing Mean, Dispersion and Fold Change
  104. % consider the mean
  105. % For each feature, we compute the mean across all the patients with
  106. % high/low gleason
  107. meanHigh = mean(normCounts(:,labels_high),2);
  108. meanLow = mean(normCounts(:,labels_low),2);
  109.  
  110. % consider the dispersion
  111. dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;
  112. dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;
  113.  
  114. % plot on a log-log scale
  115. figure;
  116. loglog(meanHigh,dispHigh,'r.');
  117. hold on;
  118. loglog(meanLow,dispLow,'b.');
  119. xlabel('log2(Mean)');
  120. ylabel('log2(Dispersion)');
  121. legend('High','Low','Location','southwest');
  122.  
  123. % compute the mean and the log2FC
  124. meanBase = (meanHigh + meanLow) / 2;
  125. foldChange = meanHigh ./ meanLow;
  126. log2FC = log2(foldChange);
  127. <syntaxhighlight/>
  128.  
  129. === MA Plot ===
  130. <syntaxhighlight lang="matlab" line>
  131. %% MA Plot
  132. % plot mean vs. fold change (MA plot)
  133. mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);
  134. set(get(gca,'Xlabel'),'String','mean of normalized counts')
  135. set(get(gca,'Ylabel'),'String','log2(fold change)')
  136. <syntaxhighlight/>
  137.  
  138. ==== Interactive ====
  139. <syntaxhighlight lang="matlab" line>
  140. %% Interactive MA Plot
  141. mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');
  142. <syntaxhighlight/>
  143.  
  144. === Gene Table ===
  145. <syntaxhighlight lang="matlab" line>
  146. %% Create Gene Table
  147. % create table with statistics about each gene
  148. geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);
  149. geneTable.Properties.RowNames = genData.Properties.RowNames;
  150. % summary
  151. summary(geneTable)
  152.  
  153. % preview
  154. geneTable(1:10,:)
  155. <syntaxhighlight/>
  156.  
  157. === Inferring Differential Expression with a Negative Binomial Model ===
  158. ==== Constant Variance Link ====
  159. <syntaxhighlight lang="matlab" line>
  160. %% Constant Variance Link  
  161. tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...
  162.                   'VarianceLink','Constant');
  163. h = plotVarianceLink(tConstant, 'Compare', 0);
  164. % set custom title
  165. h(1).Title.String = 'Variance Link on Low Samples';
  166. h(2).Title.String = 'Variance Link on High Samples';
  167. % save
  168. saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");
  169. saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");
  170. <syntaxhighlight/>
  171.  
  172. ==== Local Regression Variance Link ====
  173. <syntaxhighlight lang="matlab" line>
  174. %% Local Regression Variance Link
  175. tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');
  176. h = plotVarianceLink(tLocal, 'Compare', 1);
  177. % set custom title
  178. h(1).Title.String = 'Variance Link on Low Samples';
  179. h(2).Title.String = 'Variance Link on High Samples';
  180. % save
  181. saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");
  182. saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");
  183. <syntaxhighlight/>
  184.  
  185. === Histogram of P-values ===
  186. <syntaxhighlight lang="matlab" line>
  187. %% Histogram of P-values
  188. h = figure;
  189. histogram(tLocal.pValue,100)
  190. xlabel('P-value');
  191. ylabel('Frequency');
  192. title('P-value enrichment')
  193. saveas(h, path_images_folder + "p-values_histogram_gleason.png");
  194. <syntaxhighlight/>
  195.  
  196. ==== Histogram of P-values without low count genes ====
  197. <syntaxhighlight lang="matlab" line>
  198. %%% Histogram of P-values without low count genes
  199. h = figure;
  200. lowCountThreshold = 10;
  201. lowCountGenes = all(counts < lowCountThreshold, 2);
  202. histogram(tLocal.pValue(~lowCountGenes),100)
  203. xlabel('P-value');
  204. ylabel('Frequency');
  205. title('P-value enrichment without low count genes')
  206. saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");
  207. <syntaxhighlight/>
  208.  
  209. === Multiple testing adjusted P-values ===
  210. <syntaxhighlight lang="matlab" line>
  211. %% Multiple testing adjusted P-values
  212. % compute the adjusted P-values (BH correction)
  213. padj = mafdr(tLocal.pValue,'BHFDR',true);
  214.  
  215. % add to the existing table
  216. geneTable.pvalue = tLocal.pValue;
  217. geneTable.padj = padj;
  218.  
  219. % create a table with significant genes using adjusted p-values
  220. sig = geneTable.padj < 0.1;
  221. geneTableSig = geneTable(sig,:);
  222. geneTableSig = sortrows(geneTableSig,'padj');
  223. numberSigGenes = size(geneTableSig,1);
  224.  
  225. fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);
  226. <syntaxhighlight/> 
  227.  
  228. === Identifying the Most Up-regulated and Down-regulated Genes ===
  229. <syntaxhighlight lang="matlab" line>
  230. %% Identifying the Most Up-regulated and Down-regulated Genes
  231. % find up-regulated genes
  232. up = geneTableSig.log2FC > 1;
  233. upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');
  234. numberSigGenesUp = sum(up);
  235. fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);
  236.  
  237. % find down-regulated genes
  238. down = geneTableSig.log2FC < -1;
  239. downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');
  240. numberSigGenesDown = sum(down);
  241. fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);
  242. <syntaxhighlight/>
  243.  
  244. === Plot FC vs Mean ===
  245. <syntaxhighlight lang="matlab" line>
  246. %% Plot FC vs Mean
  247. % A good visualization of the gene expressions and their significance is given by 
  248. % plotting the fold change versus the mean in log scale and coloring the data points according 
  249. % to the adjusted P-values.
  250.  
  251. figure
  252. scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')
  253. colormap(flipud(cool(256)));
  254. colorbar;
  255. ylabel('log2(Fold Change)');
  256. xlabel('log2(Mean of normalized counts)');
  257. title('Fold change by FDR')
  258.  
  259. % You can see here that for weakly expressed genes (i.e. those with low means), 
  260. % the FDR is generally high because low read counts are dominated by Poisson noise
  261. % and consequently any biological variability is drowned in the uncertainties from the read counting.
  262. <syntaxhighlight/>