RNASeq MATLAB: differenze tra le versioni
Da Bioingegneria Elettronica e Informatica.
(Creata pagina con "== RNA-Seq with MATLAB == [http://www.vitoantoniobevilacqua.it/wiki/images/0/0f/RNAseq.pdf Slides] '''Vitoantonio Bevilacqua''' [mailto:vitoantonio.bevilacqua@poliba.it vito...") |
|||
(3 versioni intermedie di uno stesso utente non sono mostrate ) | |||
Riga 1: | Riga 1: | ||
− | + | '''Nicola Altini''' [mailto:nicola.altini@poliba.it nicola.altini@poliba.it] | |
− | [ | + | |
+ | '''Simona De Summa''' [mailto:s.desumma@oncologico.bari.it s.desumma@oncologico.bari.it] | ||
'''Vitoantonio Bevilacqua''' [mailto:vitoantonio.bevilacqua@poliba.it vitoantonio.bevilacqua@poliba.it] | '''Vitoantonio Bevilacqua''' [mailto:vitoantonio.bevilacqua@poliba.it vitoantonio.bevilacqua@poliba.it] | ||
− | + | [http://www.vitoantoniobevilacqua.it/wiki/images/0/0f/RNAseq.pdf Slides] | |
− | ''' | + | == Example: lncRNAs in prostate cancer in relation to Gleason score == |
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% lncRNAs in prostate cancer in relation to Gleason score | ||
+ | %%% Read Input Data | ||
+ | path_images_folder = "MATLAB/RNASeq/images/"; | ||
+ | % Loading read counts data | ||
+ | filenameGenData = "matrix_seminario.csv"; | ||
+ | genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1); | ||
+ | % Loading clinical data | ||
+ | filenameClinicalData = "design_gleason.csv"; | ||
+ | clinicalData = readtable(filenameClinicalData); | ||
+ | |||
+ | % Count Features and Samples | ||
+ | nFeatures = size(genData, 1); | ||
+ | nPatients = size(genData, 2); | ||
+ | fprintf("There are %d features and %d samples\n", nFeatures, nPatients); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Labels Assignment === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %%% Labels | ||
+ | % Create label vectors for high/low gleason score patients | ||
+ | labels_low = []; | ||
+ | labels_high = []; | ||
+ | for i = 1:nPatients | ||
+ | label = clinicalData{i,3}{1}; | ||
+ | if label == "low" | ||
+ | labels_low = [labels_low, i]; | ||
+ | else | ||
+ | labels_high = [labels_high, i]; | ||
+ | end | ||
+ | end | ||
+ | nLows = numel(labels_low); | ||
+ | nHighs = numel(labels_high); | ||
+ | fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Dimensionality Reduction === | ||
+ | ==== PCA ==== | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% PCA | ||
+ | genDataArray = table2array(genData); | ||
+ | genDataArrayT = genDataArray'; | ||
+ | genDataArrayNormT = log2(genDataArrayT + 1); | ||
+ | [coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0); | ||
+ | |||
+ | %%% Plot PC1 + PC2 | ||
+ | figure | ||
+ | scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black'); | ||
+ | axis([-0.5 0.5 -0.5 0.5]) | ||
+ | hold on | ||
+ | scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red'); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ==== t-SNE ==== | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% tSNE | ||
+ | nDim = 3; | ||
+ | genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim); | ||
+ | figure | ||
+ | if nDim == 2 | ||
+ | scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black'); | ||
+ | hold on | ||
+ | scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red'); | ||
+ | elseif nDim == 3 | ||
+ | scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ... | ||
+ | genDataEmbedding(labels_low,3), 3, 'black'); | ||
+ | hold on | ||
+ | scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ... | ||
+ | genDataEmbedding(labels_high,3), 3, 'red'); | ||
+ | end | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Normalizing Read Counts === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Normalizing Read Counts | ||
+ | counts = genDataArray; | ||
+ | |||
+ | % estimate pseudo-reference with geometric mean row by row | ||
+ | pseudoRefSample = geomean(counts,2); | ||
+ | nz = pseudoRefSample > 0; | ||
+ | ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz)); | ||
+ | sizeFactors = median(ratios,1); | ||
+ | |||
+ | % transform to common scale | ||
+ | normCounts = bsxfun(@rdivide,counts,sizeFactors); | ||
+ | normCounts(1:10,:); | ||
+ | |||
+ | %% Boxplots | ||
+ | figure; | ||
+ | |||
+ | subplot(2,1,1) | ||
+ | maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal') | ||
+ | ylabel('sample'); | ||
+ | xlabel('log2(counts)'); | ||
+ | |||
+ | subplot(2,1,2) | ||
+ | maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal') | ||
+ | ylabel('sample'); | ||
+ | xlabel('log2(counts)'); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Computing Mean, Dispersion and Fold Change === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Computing Mean, Dispersion and Fold Change | ||
+ | % consider the mean | ||
+ | % For each feature, we compute the mean across all the patients with | ||
+ | % high/low gleason | ||
+ | meanHigh = mean(normCounts(:,labels_high),2); | ||
+ | meanLow = mean(normCounts(:,labels_low),2); | ||
+ | |||
+ | % consider the dispersion | ||
+ | dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh; | ||
+ | dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow; | ||
+ | |||
+ | % plot on a log-log scale | ||
+ | figure; | ||
+ | loglog(meanHigh,dispHigh,'r.'); | ||
+ | hold on; | ||
+ | loglog(meanLow,dispLow,'b.'); | ||
+ | xlabel('log2(Mean)'); | ||
+ | ylabel('log2(Dispersion)'); | ||
+ | legend('High','Low','Location','southwest'); | ||
+ | |||
+ | % compute the mean and the log2FC | ||
+ | meanBase = (meanHigh + meanLow) / 2; | ||
+ | foldChange = meanHigh ./ meanLow; | ||
+ | log2FC = log2(foldChange); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === MA Plot === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% MA Plot | ||
+ | % plot mean vs. fold change (MA plot) | ||
+ | mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true); | ||
+ | set(get(gca,'Xlabel'),'String','mean of normalized counts') | ||
+ | set(get(gca,'Ylabel'),'String','log2(fold change)') | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ==== Interactive ==== | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Interactive MA Plot | ||
+ | mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA'); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Gene Table === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Create Gene Table | ||
+ | % create table with statistics about each gene | ||
+ | geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC); | ||
+ | geneTable.Properties.RowNames = genData.Properties.RowNames; | ||
+ | % summary | ||
+ | summary(geneTable) | ||
+ | |||
+ | % preview | ||
+ | geneTable(1:10,:) | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Inferring Differential Expression with a Negative Binomial Model === | ||
+ | ==== Constant Variance Link ==== | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Constant Variance Link | ||
+ | tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),... | ||
+ | 'VarianceLink','Constant'); | ||
+ | h = plotVarianceLink(tConstant, 'Compare', 0); | ||
+ | % set custom title | ||
+ | h(1).Title.String = 'Variance Link on Low Samples'; | ||
+ | h(2).Title.String = 'Variance Link on High Samples'; | ||
+ | % save | ||
+ | saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png"); | ||
+ | saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png"); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ==== Local Regression Variance Link ==== | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Local Regression Variance Link | ||
+ | tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression'); | ||
+ | h = plotVarianceLink(tLocal, 'Compare', 1); | ||
+ | % set custom title | ||
+ | h(1).Title.String = 'Variance Link on Low Samples'; | ||
+ | h(2).Title.String = 'Variance Link on High Samples'; | ||
+ | % save | ||
+ | saveas(h(1),path_images_folder + "tLocal_treated_prostate.png"); | ||
+ | saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png"); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Histogram of P-values === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Histogram of P-values | ||
+ | h = figure; | ||
+ | histogram(tLocal.pValue,100) | ||
+ | xlabel('P-value'); | ||
+ | ylabel('Frequency'); | ||
+ | title('P-value enrichment') | ||
+ | saveas(h, path_images_folder + "p-values_histogram_gleason.png"); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ==== Histogram of P-values without low count genes ==== | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %%% Histogram of P-values without low count genes | ||
+ | h = figure; | ||
+ | lowCountThreshold = 10; | ||
+ | lowCountGenes = all(counts < lowCountThreshold, 2); | ||
+ | histogram(tLocal.pValue(~lowCountGenes),100) | ||
+ | xlabel('P-value'); | ||
+ | ylabel('Frequency'); | ||
+ | title('P-value enrichment without low count genes') | ||
+ | saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png"); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Multiple testing adjusted P-values === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Multiple testing adjusted P-values | ||
+ | % compute the adjusted P-values (BH correction) | ||
+ | padj = mafdr(tLocal.pValue,'BHFDR',true); | ||
+ | |||
+ | % add to the existing table | ||
+ | geneTable.pvalue = tLocal.pValue; | ||
+ | geneTable.padj = padj; | ||
+ | |||
+ | % create a table with significant genes using adjusted p-values | ||
+ | sig = geneTable.padj < 0.1; | ||
+ | geneTableSig = geneTable(sig,:); | ||
+ | geneTableSig = sortrows(geneTableSig,'padj'); | ||
+ | numberSigGenes = size(geneTableSig,1); | ||
+ | |||
+ | fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Identifying the Most Up-regulated and Down-regulated Genes === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Identifying the Most Up-regulated and Down-regulated Genes | ||
+ | % find up-regulated genes | ||
+ | up = geneTableSig.log2FC > 1; | ||
+ | upGenes = sortrows(geneTableSig(up,:),'log2FC','descend'); | ||
+ | numberSigGenesUp = sum(up); | ||
+ | fprintf('There are %d Up-regulated genes\n', numberSigGenesUp); | ||
+ | |||
+ | % find down-regulated genes | ||
+ | down = geneTableSig.log2FC < -1; | ||
+ | downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend'); | ||
+ | numberSigGenesDown = sum(down); | ||
+ | fprintf('There are %d Down-regulated genes\n', numberSigGenesUp); | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | === Plot FC vs Mean === | ||
+ | <syntaxhighlight lang="matlab" line> | ||
+ | %% Plot FC vs Mean | ||
+ | % A good visualization of the gene expressions and their significance is given by | ||
+ | % plotting the fold change versus the mean in log scale and coloring the data points according | ||
+ | % to the adjusted P-values. | ||
+ | |||
+ | figure | ||
+ | scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o') | ||
+ | colormap(flipud(cool(256))); | ||
+ | colorbar; | ||
+ | ylabel('log2(Fold Change)'); | ||
+ | xlabel('log2(Mean of normalized counts)'); | ||
+ | title('Fold change by FDR') | ||
+ | |||
+ | % You can see here that for weakly expressed genes (i.e. those with low means), | ||
+ | % the FDR is generally high because low read counts are dominated by Poisson noise | ||
+ | % and consequently any biological variability is drowned in the uncertainties from the read counting. | ||
+ | </syntaxhighlight> |
Versione attuale delle 12:15, 30 nov 2019
Nicola Altini nicola.altini@poliba.it
Simona De Summa s.desumma@oncologico.bari.it
Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it
Indice
- 1 Example: lncRNAs in prostate cancer in relation to Gleason score
- 1.1 Labels Assignment
- 1.2 Dimensionality Reduction
- 1.3 Normalizing Read Counts
- 1.4 Computing Mean, Dispersion and Fold Change
- 1.5 MA Plot
- 1.6 Gene Table
- 1.7 Inferring Differential Expression with a Negative Binomial Model
- 1.8 Histogram of P-values
- 1.9 Multiple testing adjusted P-values
- 1.10 Identifying the Most Up-regulated and Down-regulated Genes
- 1.11 Plot FC vs Mean
Example: lncRNAs in prostate cancer in relation to Gleason score
%% lncRNAs in prostate cancer in relation to Gleason score
%%% Read Input Data
path_images_folder = "MATLAB/RNASeq/images/";
% Loading read counts data
filenameGenData = "matrix_seminario.csv";
genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);
% Loading clinical data
filenameClinicalData = "design_gleason.csv";
clinicalData = readtable(filenameClinicalData);
% Count Features and Samples
nFeatures = size(genData, 1);
nPatients = size(genData, 2);
fprintf("There are %d features and %d samples\n", nFeatures, nPatients);
Labels Assignment
%%% Labels
% Create label vectors for high/low gleason score patients
labels_low = [];
labels_high = [];
for i = 1:nPatients
label = clinicalData{i,3}{1};
if label == "low"
labels_low = [labels_low, i];
else
labels_high = [labels_high, i];
end
end
nLows = numel(labels_low);
nHighs = numel(labels_high);
fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);
Dimensionality Reduction
PCA
%% PCA
genDataArray = table2array(genData);
genDataArrayT = genDataArray';
genDataArrayNormT = log2(genDataArrayT + 1);
[coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);
%%% Plot PC1 + PC2
figure
scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');
axis([-0.5 0.5 -0.5 0.5])
hold on
scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');
t-SNE
%% tSNE
nDim = 3;
genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);
figure
if nDim == 2
scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');
hold on
scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');
elseif nDim == 3
scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...
genDataEmbedding(labels_low,3), 3, 'black');
hold on
scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...
genDataEmbedding(labels_high,3), 3, 'red');
end
Normalizing Read Counts
%% Normalizing Read Counts
counts = genDataArray;
% estimate pseudo-reference with geometric mean row by row
pseudoRefSample = geomean(counts,2);
nz = pseudoRefSample > 0;
ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));
sizeFactors = median(ratios,1);
% transform to common scale
normCounts = bsxfun(@rdivide,counts,sizeFactors);
normCounts(1:10,:);
%% Boxplots
figure;
subplot(2,1,1)
maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')
ylabel('sample');
xlabel('log2(counts)');
subplot(2,1,2)
maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')
ylabel('sample');
xlabel('log2(counts)');
Computing Mean, Dispersion and Fold Change
%% Computing Mean, Dispersion and Fold Change
% consider the mean
% For each feature, we compute the mean across all the patients with
% high/low gleason
meanHigh = mean(normCounts(:,labels_high),2);
meanLow = mean(normCounts(:,labels_low),2);
% consider the dispersion
dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;
dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;
% plot on a log-log scale
figure;
loglog(meanHigh,dispHigh,'r.');
hold on;
loglog(meanLow,dispLow,'b.');
xlabel('log2(Mean)');
ylabel('log2(Dispersion)');
legend('High','Low','Location','southwest');
% compute the mean and the log2FC
meanBase = (meanHigh + meanLow) / 2;
foldChange = meanHigh ./ meanLow;
log2FC = log2(foldChange);
MA Plot
%% MA Plot
% plot mean vs. fold change (MA plot)
mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);
set(get(gca,'Xlabel'),'String','mean of normalized counts')
set(get(gca,'Ylabel'),'String','log2(fold change)')
Interactive
%% Interactive MA Plot
mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');
Gene Table
%% Create Gene Table
% create table with statistics about each gene
geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);
geneTable.Properties.RowNames = genData.Properties.RowNames;
% summary
summary(geneTable)
% preview
geneTable(1:10,:)
Inferring Differential Expression with a Negative Binomial Model
Constant Variance Link
%% Constant Variance Link
tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...
'VarianceLink','Constant');
h = plotVarianceLink(tConstant, 'Compare', 0);
% set custom title
h(1).Title.String = 'Variance Link on Low Samples';
h(2).Title.String = 'Variance Link on High Samples';
% save
saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");
saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");
Local Regression Variance Link
%% Local Regression Variance Link
tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');
h = plotVarianceLink(tLocal, 'Compare', 1);
% set custom title
h(1).Title.String = 'Variance Link on Low Samples';
h(2).Title.String = 'Variance Link on High Samples';
% save
saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");
saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");
Histogram of P-values
%% Histogram of P-values
h = figure;
histogram(tLocal.pValue,100)
xlabel('P-value');
ylabel('Frequency');
title('P-value enrichment')
saveas(h, path_images_folder + "p-values_histogram_gleason.png");
Histogram of P-values without low count genes
%%% Histogram of P-values without low count genes
h = figure;
lowCountThreshold = 10;
lowCountGenes = all(counts < lowCountThreshold, 2);
histogram(tLocal.pValue(~lowCountGenes),100)
xlabel('P-value');
ylabel('Frequency');
title('P-value enrichment without low count genes')
saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");
Multiple testing adjusted P-values
%% Multiple testing adjusted P-values
% compute the adjusted P-values (BH correction)
padj = mafdr(tLocal.pValue,'BHFDR',true);
% add to the existing table
geneTable.pvalue = tLocal.pValue;
geneTable.padj = padj;
% create a table with significant genes using adjusted p-values
sig = geneTable.padj < 0.1;
geneTableSig = geneTable(sig,:);
geneTableSig = sortrows(geneTableSig,'padj');
numberSigGenes = size(geneTableSig,1);
fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);
Identifying the Most Up-regulated and Down-regulated Genes
%% Identifying the Most Up-regulated and Down-regulated Genes
% find up-regulated genes
up = geneTableSig.log2FC > 1;
upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');
numberSigGenesUp = sum(up);
fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);
% find down-regulated genes
down = geneTableSig.log2FC < -1;
downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');
numberSigGenesDown = sum(down);
fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);
Plot FC vs Mean
%% Plot FC vs Mean
% A good visualization of the gene expressions and their significance is given by
% plotting the fold change versus the mean in log scale and coloring the data points according
% to the adjusted P-values.
figure
scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')
colormap(flipud(cool(256)));
colorbar;
ylabel('log2(Fold Change)');
xlabel('log2(Mean of normalized counts)');
title('Fold change by FDR')
% You can see here that for weakly expressed genes (i.e. those with low means),
% the FDR is generally high because low read counts are dominated by Poisson noise
% and consequently any biological variability is drowned in the uncertainties from the read counting.