RNASeq MATLAB: differenze tra le versioni
Da Bioingegneria Elettronica e Informatica.
Riga 23: | Riga 23: | ||
nPatients = size(genData, 2); | nPatients = size(genData, 2); | ||
fprintf("There are %d features and %d samples\n", nFeatures, nPatients); | fprintf("There are %d features and %d samples\n", nFeatures, nPatients); | ||
− | < | + | </syntaxhighlight> |
=== Labels Assignment === | === Labels Assignment === | ||
Riga 42: | Riga 42: | ||
nHighs = numel(labels_high); | nHighs = numel(labels_high); | ||
fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs); | fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs); | ||
− | + | ||
=== Dimensionality Reduction === | === Dimensionality Reduction === | ||
Riga 59: | Riga 59: | ||
hold on | hold on | ||
scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red'); | scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red'); | ||
− | < | + | </syntaxhighlight> |
==== t-SNE ==== | ==== t-SNE ==== | ||
Riga 78: | Riga 78: | ||
genDataEmbedding(labels_high,3), 3, 'red'); | genDataEmbedding(labels_high,3), 3, 'red'); | ||
end | end | ||
− | < | + | </syntaxhighlight> |
=== Normalizing Read Counts === | === Normalizing Read Counts === | ||
Riga 107: | Riga 107: | ||
ylabel('sample'); | ylabel('sample'); | ||
xlabel('log2(counts)'); | xlabel('log2(counts)'); | ||
− | < | + | </syntaxhighlight> |
=== Computing Mean, Dispersion and Fold Change === | === Computing Mean, Dispersion and Fold Change === | ||
Riga 135: | Riga 135: | ||
foldChange = meanHigh ./ meanLow; | foldChange = meanHigh ./ meanLow; | ||
log2FC = log2(foldChange); | log2FC = log2(foldChange); | ||
− | < | + | </syntaxhighlight> |
=== MA Plot === | === MA Plot === | ||
Riga 144: | Riga 144: | ||
set(get(gca,'Xlabel'),'String','mean of normalized counts') | set(get(gca,'Xlabel'),'String','mean of normalized counts') | ||
set(get(gca,'Ylabel'),'String','log2(fold change)') | set(get(gca,'Ylabel'),'String','log2(fold change)') | ||
− | < | + | </syntaxhighlight> |
==== Interactive ==== | ==== Interactive ==== | ||
Riga 150: | Riga 150: | ||
%% Interactive MA Plot | %% Interactive MA Plot | ||
mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA'); | mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA'); | ||
− | < | + | </syntaxhighlight> |
=== Gene Table === | === Gene Table === | ||
Riga 163: | Riga 163: | ||
% preview | % preview | ||
geneTable(1:10,:) | geneTable(1:10,:) | ||
− | < | + | </syntaxhighlight> |
=== Inferring Differential Expression with a Negative Binomial Model === | === Inferring Differential Expression with a Negative Binomial Model === | ||
Riga 178: | Riga 178: | ||
saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png"); | saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png"); | ||
saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png"); | saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png"); | ||
− | < | + | </syntaxhighlight> |
==== Local Regression Variance Link ==== | ==== Local Regression Variance Link ==== | ||
Riga 191: | Riga 191: | ||
saveas(h(1),path_images_folder + "tLocal_treated_prostate.png"); | saveas(h(1),path_images_folder + "tLocal_treated_prostate.png"); | ||
saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png"); | saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png"); | ||
− | < | + | </syntaxhighlight> |
=== Histogram of P-values === | === Histogram of P-values === | ||
Riga 202: | Riga 202: | ||
title('P-value enrichment') | title('P-value enrichment') | ||
saveas(h, path_images_folder + "p-values_histogram_gleason.png"); | saveas(h, path_images_folder + "p-values_histogram_gleason.png"); | ||
− | < | + | </syntaxhighlight> |
==== Histogram of P-values without low count genes ==== | ==== Histogram of P-values without low count genes ==== | ||
Riga 215: | Riga 215: | ||
title('P-value enrichment without low count genes') | title('P-value enrichment without low count genes') | ||
saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png"); | saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png"); | ||
− | < | + | </syntaxhighlight> |
=== Multiple testing adjusted P-values === | === Multiple testing adjusted P-values === | ||
Riga 234: | Riga 234: | ||
fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures); | fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures); | ||
− | < | + | </syntaxhighlight> |
=== Identifying the Most Up-regulated and Down-regulated Genes === | === Identifying the Most Up-regulated and Down-regulated Genes === | ||
Riga 250: | Riga 250: | ||
numberSigGenesDown = sum(down); | numberSigGenesDown = sum(down); | ||
fprintf('There are %d Down-regulated genes\n', numberSigGenesUp); | fprintf('There are %d Down-regulated genes\n', numberSigGenesUp); | ||
− | < | + | </syntaxhighlight> |
=== Plot FC vs Mean === | === Plot FC vs Mean === | ||
Riga 270: | Riga 270: | ||
% the FDR is generally high because low read counts are dominated by Poisson noise | % the FDR is generally high because low read counts are dominated by Poisson noise | ||
% and consequently any biological variability is drowned in the uncertainties from the read counting. | % and consequently any biological variability is drowned in the uncertainties from the read counting. | ||
− | < | + | </syntaxhighlight> |
Versione delle 12:14, 30 nov 2019
Nicola Altini nicola.altini@poliba.it
Simona De Summa s.desumma@oncologico.bari.it
Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it
Indice
- 1 Example: lncRNAs in prostate cancer in relation to Gleason score
- 1.1 Labels Assignment
- 1.2 Normalizing Read Counts
- 1.3 Computing Mean, Dispersion and Fold Change
- 1.4 MA Plot
- 1.5 Gene Table
- 1.6 Inferring Differential Expression with a Negative Binomial Model
- 1.7 Histogram of P-values
- 1.8 Multiple testing adjusted P-values
- 1.9 Identifying the Most Up-regulated and Down-regulated Genes
- 1.10 Plot FC vs Mean
Example: lncRNAs in prostate cancer in relation to Gleason score
%% lncRNAs in prostate cancer in relation to Gleason score
%%% Read Input Data
path_images_folder = "MATLAB/RNASeq/images/";
% Loading read counts data
filenameGenData = "matrix_seminario.csv";
genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);
% Loading clinical data
filenameClinicalData = "design_gleason.csv";
clinicalData = readtable(filenameClinicalData);
% Count Features and Samples
nFeatures = size(genData, 1);
nPatients = size(genData, 2);
fprintf("There are %d features and %d samples\n", nFeatures, nPatients);
Labels Assignment
%%% Labels
% Create label vectors for high/low gleason score patients
labels_low = [];
labels_high = [];
for i = 1:nPatients
label = clinicalData{i,3}{1};
if label == "low"
labels_low = [labels_low, i];
else
labels_high = [labels_high, i];
end
end
nLows = numel(labels_low);
nHighs = numel(labels_high);
fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);
=== Dimensionality Reduction ===
==== PCA ====
<syntaxhighlight lang="matlab" line>
%% PCA
genDataArray = table2array(genData);
genDataArrayT = genDataArray';
genDataArrayNormT = log2(genDataArrayT + 1);
[coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);
%%% Plot PC1 + PC2
figure
scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');
axis([-0.5 0.5 -0.5 0.5])
hold on
scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');
t-SNE
%% tSNE
nDim = 3;
genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);
figure
if nDim == 2
scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');
hold on
scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');
elseif nDim == 3
scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...
genDataEmbedding(labels_low,3), 3, 'black');
hold on
scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...
genDataEmbedding(labels_high,3), 3, 'red');
end
Normalizing Read Counts
%% Normalizing Read Counts
counts = genDataArray;
% estimate pseudo-reference with geometric mean row by row
pseudoRefSample = geomean(counts,2);
nz = pseudoRefSample > 0;
ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));
sizeFactors = median(ratios,1);
% transform to common scale
normCounts = bsxfun(@rdivide,counts,sizeFactors);
normCounts(1:10,:);
%% Boxplots
figure;
subplot(2,1,1)
maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')
ylabel('sample');
xlabel('log2(counts)');
subplot(2,1,2)
maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')
ylabel('sample');
xlabel('log2(counts)');
Computing Mean, Dispersion and Fold Change
%% Computing Mean, Dispersion and Fold Change
% consider the mean
% For each feature, we compute the mean across all the patients with
% high/low gleason
meanHigh = mean(normCounts(:,labels_high),2);
meanLow = mean(normCounts(:,labels_low),2);
% consider the dispersion
dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;
dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;
% plot on a log-log scale
figure;
loglog(meanHigh,dispHigh,'r.');
hold on;
loglog(meanLow,dispLow,'b.');
xlabel('log2(Mean)');
ylabel('log2(Dispersion)');
legend('High','Low','Location','southwest');
% compute the mean and the log2FC
meanBase = (meanHigh + meanLow) / 2;
foldChange = meanHigh ./ meanLow;
log2FC = log2(foldChange);
MA Plot
%% MA Plot
% plot mean vs. fold change (MA plot)
mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);
set(get(gca,'Xlabel'),'String','mean of normalized counts')
set(get(gca,'Ylabel'),'String','log2(fold change)')
Interactive
%% Interactive MA Plot
mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');
Gene Table
%% Create Gene Table
% create table with statistics about each gene
geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);
geneTable.Properties.RowNames = genData.Properties.RowNames;
% summary
summary(geneTable)
% preview
geneTable(1:10,:)
Inferring Differential Expression with a Negative Binomial Model
Constant Variance Link
%% Constant Variance Link
tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...
'VarianceLink','Constant');
h = plotVarianceLink(tConstant, 'Compare', 0);
% set custom title
h(1).Title.String = 'Variance Link on Low Samples';
h(2).Title.String = 'Variance Link on High Samples';
% save
saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");
saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");
Local Regression Variance Link
%% Local Regression Variance Link
tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');
h = plotVarianceLink(tLocal, 'Compare', 1);
% set custom title
h(1).Title.String = 'Variance Link on Low Samples';
h(2).Title.String = 'Variance Link on High Samples';
% save
saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");
saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");
Histogram of P-values
%% Histogram of P-values
h = figure;
histogram(tLocal.pValue,100)
xlabel('P-value');
ylabel('Frequency');
title('P-value enrichment')
saveas(h, path_images_folder + "p-values_histogram_gleason.png");
Histogram of P-values without low count genes
%%% Histogram of P-values without low count genes
h = figure;
lowCountThreshold = 10;
lowCountGenes = all(counts < lowCountThreshold, 2);
histogram(tLocal.pValue(~lowCountGenes),100)
xlabel('P-value');
ylabel('Frequency');
title('P-value enrichment without low count genes')
saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");
Multiple testing adjusted P-values
%% Multiple testing adjusted P-values
% compute the adjusted P-values (BH correction)
padj = mafdr(tLocal.pValue,'BHFDR',true);
% add to the existing table
geneTable.pvalue = tLocal.pValue;
geneTable.padj = padj;
% create a table with significant genes using adjusted p-values
sig = geneTable.padj < 0.1;
geneTableSig = geneTable(sig,:);
geneTableSig = sortrows(geneTableSig,'padj');
numberSigGenes = size(geneTableSig,1);
fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);
Identifying the Most Up-regulated and Down-regulated Genes
%% Identifying the Most Up-regulated and Down-regulated Genes
% find up-regulated genes
up = geneTableSig.log2FC > 1;
upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');
numberSigGenesUp = sum(up);
fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);
% find down-regulated genes
down = geneTableSig.log2FC < -1;
downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');
numberSigGenesDown = sum(down);
fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);
Plot FC vs Mean
%% Plot FC vs Mean
% A good visualization of the gene expressions and their significance is given by
% plotting the fold change versus the mean in log scale and coloring the data points according
% to the adjusted P-values.
figure
scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')
colormap(flipud(cool(256)));
colorbar;
ylabel('log2(Fold Change)');
xlabel('log2(Mean of normalized counts)');
title('Fold change by FDR')
% You can see here that for weakly expressed genes (i.e. those with low means),
% the FDR is generally high because low read counts are dominated by Poisson noise
% and consequently any biological variability is drowned in the uncertainties from the read counting.