RNASeq MATLAB
Da Bioingegneria Elettronica e Informatica.
Versione del 30 nov 2019 alle 12:12 di Profbevilacqua (Discussione | contributi)
Nicola Altini nicola.altini@poliba.it
Simona De Summa s.desumma@oncologico.bari.it
Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it
Example: lncRNAs in prostate cancer in relation to Gleason score
%% lncRNAs in prostate cancer in relation to Gleason score
%%% Read Input Data
path_images_folder = "MATLAB/RNASeq/images/";
% Loading read counts data
filenameGenData = "matrix_seminario.csv";
genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);
% Loading clinical data
filenameClinicalData = "design_gleason.csv";
clinicalData = readtable(filenameClinicalData);
% Count Features and Samples
nFeatures = size(genData, 1);
nPatients = size(genData, 2);
fprintf("There are %d features and %d samples\n", nFeatures, nPatients);
<syntaxhighlight/>
=== Labels Assignment ===
<syntaxhighlight lang="matlab" line>
%%% Labels
% Create label vectors for high/low gleason score patients
labels_low = [];
labels_high = [];
for i = 1:nPatients
label = clinicalData{i,3}{1};
if label == "low"
labels_low = [labels_low, i];
else
labels_high = [labels_high, i];
end
end
nLows = numel(labels_low);
nHighs = numel(labels_high);
fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);
<syntaxhighlight/>
=== Dimensionality Reduction ===
==== PCA ====
<syntaxhighlight lang="matlab" line>
%% PCA
genDataArray = table2array(genData);
genDataArrayT = genDataArray';
genDataArrayNormT = log2(genDataArrayT + 1);
[coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);
%%% Plot PC1 + PC2
figure
scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');
axis([-0.5 0.5 -0.5 0.5])
hold on
scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');
<syntaxhighlight/>
==== t-SNE ====
<syntaxhighlight lang="matlab" line>
%% tSNE
nDim = 3;
genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);
figure
if nDim == 2
scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');
hold on
scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');
elseif nDim == 3
scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...
genDataEmbedding(labels_low,3), 3, 'black');
hold on
scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...
genDataEmbedding(labels_high,3), 3, 'red');
end
<syntaxhighlight/>
=== Normalizing Read Counts ===
<syntaxhighlight lang="matlab" line>
%% Normalizing Read Counts
counts = genDataArray;
% estimate pseudo-reference with geometric mean row by row
pseudoRefSample = geomean(counts,2);
nz = pseudoRefSample > 0;
ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));
sizeFactors = median(ratios,1);
% transform to common scale
normCounts = bsxfun(@rdivide,counts,sizeFactors);
normCounts(1:10,:);
%% Boxplots
figure;
subplot(2,1,1)
maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')
ylabel('sample');
xlabel('log2(counts)');
subplot(2,1,2)
maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')
ylabel('sample');
xlabel('log2(counts)');
<syntaxhighlight/>
=== Computing Mean, Dispersion and Fold Change ===
<syntaxhighlight lang="matlab" line>
%% Computing Mean, Dispersion and Fold Change
% consider the mean
% For each feature, we compute the mean across all the patients with
% high/low gleason
meanHigh = mean(normCounts(:,labels_high),2);
meanLow = mean(normCounts(:,labels_low),2);
% consider the dispersion
dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;
dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;
% plot on a log-log scale
figure;
loglog(meanHigh,dispHigh,'r.');
hold on;
loglog(meanLow,dispLow,'b.');
xlabel('log2(Mean)');
ylabel('log2(Dispersion)');
legend('High','Low','Location','southwest');
% compute the mean and the log2FC
meanBase = (meanHigh + meanLow) / 2;
foldChange = meanHigh ./ meanLow;
log2FC = log2(foldChange);
<syntaxhighlight/>
=== MA Plot ===
<syntaxhighlight lang="matlab" line>
%% MA Plot
% plot mean vs. fold change (MA plot)
mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);
set(get(gca,'Xlabel'),'String','mean of normalized counts')
set(get(gca,'Ylabel'),'String','log2(fold change)')
<syntaxhighlight/>
==== Interactive ====
<syntaxhighlight lang="matlab" line>
%% Interactive MA Plot
mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');
<syntaxhighlight/>
=== Gene Table ===
<syntaxhighlight lang="matlab" line>
%% Create Gene Table
% create table with statistics about each gene
geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);
geneTable.Properties.RowNames = genData.Properties.RowNames;
% summary
summary(geneTable)
% preview
geneTable(1:10,:)
<syntaxhighlight/>
=== Inferring Differential Expression with a Negative Binomial Model ===
==== Constant Variance Link ====
<syntaxhighlight lang="matlab" line>
%% Constant Variance Link
tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...
'VarianceLink','Constant');
h = plotVarianceLink(tConstant, 'Compare', 0);
% set custom title
h(1).Title.String = 'Variance Link on Low Samples';
h(2).Title.String = 'Variance Link on High Samples';
% save
saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");
saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");
<syntaxhighlight/>
==== Local Regression Variance Link ====
<syntaxhighlight lang="matlab" line>
%% Local Regression Variance Link
tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');
h = plotVarianceLink(tLocal, 'Compare', 1);
% set custom title
h(1).Title.String = 'Variance Link on Low Samples';
h(2).Title.String = 'Variance Link on High Samples';
% save
saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");
saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");
<syntaxhighlight/>
=== Histogram of P-values ===
<syntaxhighlight lang="matlab" line>
%% Histogram of P-values
h = figure;
histogram(tLocal.pValue,100)
xlabel('P-value');
ylabel('Frequency');
title('P-value enrichment')
saveas(h, path_images_folder + "p-values_histogram_gleason.png");
<syntaxhighlight/>
==== Histogram of P-values without low count genes ====
<syntaxhighlight lang="matlab" line>
%%% Histogram of P-values without low count genes
h = figure;
lowCountThreshold = 10;
lowCountGenes = all(counts < lowCountThreshold, 2);
histogram(tLocal.pValue(~lowCountGenes),100)
xlabel('P-value');
ylabel('Frequency');
title('P-value enrichment without low count genes')
saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");
<syntaxhighlight/>
=== Multiple testing adjusted P-values ===
<syntaxhighlight lang="matlab" line>
%% Multiple testing adjusted P-values
% compute the adjusted P-values (BH correction)
padj = mafdr(tLocal.pValue,'BHFDR',true);
% add to the existing table
geneTable.pvalue = tLocal.pValue;
geneTable.padj = padj;
% create a table with significant genes using adjusted p-values
sig = geneTable.padj < 0.1;
geneTableSig = geneTable(sig,:);
geneTableSig = sortrows(geneTableSig,'padj');
numberSigGenes = size(geneTableSig,1);
fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);
<syntaxhighlight/>
=== Identifying the Most Up-regulated and Down-regulated Genes ===
<syntaxhighlight lang="matlab" line>
%% Identifying the Most Up-regulated and Down-regulated Genes
% find up-regulated genes
up = geneTableSig.log2FC > 1;
upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');
numberSigGenesUp = sum(up);
fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);
% find down-regulated genes
down = geneTableSig.log2FC < -1;
downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');
numberSigGenesDown = sum(down);
fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);
<syntaxhighlight/>
=== Plot FC vs Mean ===
<syntaxhighlight lang="matlab" line>
%% Plot FC vs Mean
% A good visualization of the gene expressions and their significance is given by
% plotting the fold change versus the mean in log scale and coloring the data points according
% to the adjusted P-values.
figure
scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')
colormap(flipud(cool(256)));
colorbar;
ylabel('log2(Fold Change)');
xlabel('log2(Mean of normalized counts)');
title('Fold change by FDR')
% You can see here that for weakly expressed genes (i.e. those with low means),
% the FDR is generally high because low read counts are dominated by Poisson noise
% and consequently any biological variability is drowned in the uncertainties from the read counting.
<syntaxhighlight/>