RNASeq MATLAB: differenze tra le versioni

Versione attuale delle 12:15, 30 nov 2019

Nicola Altini nicola.altini@poliba.it

Simona De Summa s.desumma@oncologico.bari.it

Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it

Indice

1 Example: lncRNAs in prostate cancer in relation to Gleason score

Example: lncRNAs in prostate cancer in relation to Gleason score

%% lncRNAs in prostate cancer in relation to Gleason score

```
%%% Read Input Data
```

path_images_folder = "MATLAB/RNASeq/images/";

```
% Loading read counts data
```

filenameGenData = "matrix_seminario.csv";

genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);

```
% Loading clinical data
```

filenameClinicalData = "design_gleason.csv";

clinicalData = readtable(filenameClinicalData);

```
 
```
```
% Count Features and Samples
```
```
nFeatures = size(genData, 1);
```
```
nPatients = size(genData, 2);
```

fprintf("There are %d features and %d samples\n", nFeatures, nPatients);

Labels Assignment

```
%%% Labels
```

% Create label vectors for high/low gleason score patients

```
labels_low = [];
```
```
labels_high = [];
```
```
for i = 1:nPatients
```
```
    label = clinicalData{i,3}{1};
```
```
    if label == "low"
```
```
        labels_low = [labels_low, i];
```
```
    else
```
```
        labels_high = [labels_high, i];
```
```
    end
```
```
end
```
```
nLows = numel(labels_low);
```
```
nHighs = numel(labels_high);
```

fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);

Dimensionality Reduction

PCA

```
%% PCA
```
```
genDataArray = table2array(genData);
```
```
genDataArrayT = genDataArray';
```

genDataArrayNormT = log2(genDataArrayT + 1);

[coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);

```
 
```
```
%%% Plot PC1 + PC2
```
```
figure
```

scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');

```
axis([-0.5 0.5 -0.5 0.5])
```
```
hold on 
```

scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');

t-SNE

```
%% tSNE
```
```
nDim = 3;
```

genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);

```
figure
```
```
if nDim == 2
```

    scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');

```
    hold on
```

    scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');

```
elseif  nDim == 3
```

    scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...

             genDataEmbedding(labels_low,3), 3, 'black');

```
    hold on
```

    scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...

             genDataEmbedding(labels_high,3), 3, 'red');

```
end
```

Normalizing Read Counts

```
%% Normalizing Read Counts
```
```
counts = genDataArray;
```
```
 
```

% estimate pseudo-reference with geometric mean row by row

```
pseudoRefSample = geomean(counts,2);
```
```
nz = pseudoRefSample > 0;
```

ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));

```
sizeFactors = median(ratios,1);
```
```
 
```
```
% transform to common scale
```

normCounts = bsxfun(@rdivide,counts,sizeFactors);

```
normCounts(1:10,:);
```
```
 
```
```
%% Boxplots
```
```
figure;
```
```
 
```
```
subplot(2,1,1)
```

maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')

```
ylabel('sample');
```
```
xlabel('log2(counts)');
```
```
 
```
```
subplot(2,1,2)
```

maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')

```
ylabel('sample');
```
```
xlabel('log2(counts)');
```

Computing Mean, Dispersion and Fold Change

%% Computing Mean, Dispersion and Fold Change

```
% consider the mean
```

% For each feature, we compute the mean across all the patients with

```
% high/low gleason
```

meanHigh = mean(normCounts(:,labels_high),2);

meanLow = mean(normCounts(:,labels_low),2);

```
 
```
```
% consider the dispersion
```

dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;

dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;

```
 
```
```
% plot on a log-log scale
```
```
figure;
```
```
loglog(meanHigh,dispHigh,'r.');
```
```
hold on;
```
```
loglog(meanLow,dispLow,'b.');
```
```
xlabel('log2(Mean)');
```
```
ylabel('log2(Dispersion)');
```

legend('High','Low','Location','southwest');

```
 
```
```
% compute the mean and the log2FC
```
```
meanBase = (meanHigh + meanLow) / 2;
```
```
foldChange = meanHigh ./ meanLow;
```
```
log2FC = log2(foldChange);
```

MA Plot

```
%% MA Plot
```
```
% plot mean vs. fold change (MA plot)
```

mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);

set(get(gca,'Xlabel'),'String','mean of normalized counts')

set(get(gca,'Ylabel'),'String','log2(fold change)')

Interactive

```
%% Interactive MA Plot
```

mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');

Gene Table

```
%% Create Gene Table
```

% create table with statistics about each gene

geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);

geneTable.Properties.RowNames = genData.Properties.RowNames;

```
% summary
```
```
summary(geneTable)
```
```
 
```
```
% preview
```
```
geneTable(1:10,:)
```

Inferring Differential Expression with a Negative Binomial Model

Constant Variance Link

```
%% Constant Variance Link  
```

tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...

                  'VarianceLink','Constant');

h = plotVarianceLink(tConstant, 'Compare', 0);

```
% set custom title
```

h(1).Title.String = 'Variance Link on Low Samples';

h(2).Title.String = 'Variance Link on High Samples';

```
% save
```

saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");

saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");

Local Regression Variance Link

```
%% Local Regression Variance Link
```

tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');

h = plotVarianceLink(tLocal, 'Compare', 1);

```
% set custom title
```

h(1).Title.String = 'Variance Link on Low Samples';

h(2).Title.String = 'Variance Link on High Samples';

```
% save
```

saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");

saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");

Histogram of P-values

```
%% Histogram of P-values
```
```
h = figure;
```
```
histogram(tLocal.pValue,100)
```
```
xlabel('P-value');
```
```
ylabel('Frequency');
```
```
title('P-value enrichment')
```

saveas(h, path_images_folder + "p-values_histogram_gleason.png");

Histogram of P-values without low count genes

%%% Histogram of P-values without low count genes

```
h = figure;
```
```
lowCountThreshold = 10;
```

lowCountGenes = all(counts < lowCountThreshold, 2);

histogram(tLocal.pValue(~lowCountGenes),100)

```
xlabel('P-value');
```
```
ylabel('Frequency');
```

title('P-value enrichment without low count genes')

saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");

Multiple testing adjusted P-values

```
%% Multiple testing adjusted P-values
```

% compute the adjusted P-values (BH correction)

padj = mafdr(tLocal.pValue,'BHFDR',true);

```
 
```
```
% add to the existing table
```
```
geneTable.pvalue = tLocal.pValue;
```
```
geneTable.padj = padj;
```
```
 
```

% create a table with significant genes using adjusted p-values

```
sig = geneTable.padj < 0.1;
```
```
geneTableSig = geneTable(sig,:);
```

geneTableSig = sortrows(geneTableSig,'padj');

```
numberSigGenes = size(geneTableSig,1);
```
```
 
```

fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);

Identifying the Most Up-regulated and Down-regulated Genes

%% Identifying the Most Up-regulated and Down-regulated Genes

```
% find up-regulated genes
```
```
up = geneTableSig.log2FC > 1;
```

upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');

```
numberSigGenesUp = sum(up);
```

fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);

```
 
```
```
% find down-regulated genes
```
```
down = geneTableSig.log2FC < -1;
```

downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');

```
numberSigGenesDown = sum(down);
```

fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);

Plot FC vs Mean

```
%% Plot FC vs Mean
```

% A good visualization of the gene expressions and their significance is given by

% plotting the fold change versus the mean in log scale and coloring the data points according

```
% to the adjusted P-values.
```
```
 
```
```
figure
```

scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')

```
colormap(flipud(cool(256)));
```
```
colorbar;
```
```
ylabel('log2(Fold Change)');
```

xlabel('log2(Mean of normalized counts)');

```
title('Fold change by FDR')
```
```
 
```

% You can see here that for weakly expressed genes (i.e. those with low means),

% the FDR is generally high because low read counts are dominated by Poisson noise

% and consequently any biological variability is drowned in the uncertainties from the read counting.

RNASeq MATLAB: differenze tra le versioni

Versione attuale delle 12:15, 30 nov 2019

Indice

Example: lncRNAs in prostate cancer in relation to Gleason score

Labels Assignment

Dimensionality Reduction

PCA

t-SNE

Normalizing Read Counts

Computing Mean, Dispersion and Fold Change

MA Plot

Interactive

Gene Table

Inferring Differential Expression with a Negative Binomial Model

Constant Variance Link

Local Regression Variance Link

Histogram of P-values

Histogram of P-values without low count genes

Multiple testing adjusted P-values

Identifying the Most Up-regulated and Down-regulated Genes

Plot FC vs Mean

Menu di navigazione

Strumenti personali

Namespace

Varianti

Visite

Azioni

Ricerca

Navigazione

Strumenti

@@ Riga 1: / Riga 1: @@
-== RNA-Seq with MATLAB ==
+'''Nicola Altini''' [mailto:nicola.altini@poliba.it nicola.altini@poliba.it]
-[http://www.vitoantoniobevilacqua.it/wiki/images/0/0f/RNAseq.pdf Slides]
+'''Simona De Summa''' [mailto:s.desumma@oncologico.bari.it s.desumma@oncologico.bari.it]
 '''Vitoantonio Bevilacqua''' [mailto:vitoantonio.bevilacqua@poliba.it vitoantonio.bevilacqua@poliba.it]
-'''Nicola Altini''' [mailto:nicola.altini@poliba.it nicola.altini@poliba.it]
+[http://www.vitoantoniobevilacqua.it/wiki/images/0/0f/RNAseq.pdf Slides]
-'''Simona De Summa''' [mailto:s.desumma@oncologico.bari.it s.desumma@oncologico.bari.it]
+== Example: lncRNAs in prostate cancer in relation to Gleason score ==
+<syntaxhighlight lang="matlab" line>
+%% lncRNAs in prostate cancer in relation to Gleason score
+%%% Read Input Data
+path_images_folder = "MATLAB/RNASeq/images/";
+% Loading read counts data
+filenameGenData = "matrix_seminario.csv";
+genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);
+% Loading clinical data
+filenameClinicalData = "design_gleason.csv";
+clinicalData = readtable(filenameClinicalData);
+% Count Features and Samples
+nFeatures = size(genData, 1);
+nPatients = size(genData, 2);
+fprintf("There are %d features and %d samples\n", nFeatures, nPatients);
+</syntaxhighlight>
+=== Labels Assignment ===
+<syntaxhighlight lang="matlab" line>
+%%% Labels
+% Create label vectors for high/low gleason score patients
+labels_low = [];
+labels_high = [];
+for i = 1:nPatients
+    label = clinicalData{i,3}{1};
+    if label == "low"
+        labels_low = [labels_low, i];
+    else
+        labels_high = [labels_high, i];
+    end
+end
+nLows = numel(labels_low);
+nHighs = numel(labels_high);
+fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);
+</syntaxhighlight>
+=== Dimensionality Reduction ===
+==== PCA ====
+<syntaxhighlight lang="matlab" line>
+%% PCA
+genDataArray = table2array(genData);
+genDataArrayT = genDataArray';
+genDataArrayNormT = log2(genDataArrayT + 1);
+[coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);
+%%% Plot PC1 + PC2
+figure
+scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');
+axis([-0.5 0.5 -0.5 0.5])
+hold on
+scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');
+</syntaxhighlight>
+==== t-SNE ====
+<syntaxhighlight lang="matlab" line>
+%% tSNE
+nDim = 3;
+genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);
+figure
+if nDim == 2
+    scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');
+    hold on
+    scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');
+elseif  nDim == 3
+    scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...
+             genDataEmbedding(labels_low,3), 3, 'black');
+    hold on
+    scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...
+             genDataEmbedding(labels_high,3), 3, 'red');
+end
+</syntaxhighlight>
+=== Normalizing Read Counts ===
+<syntaxhighlight lang="matlab" line>
+%% Normalizing Read Counts
+counts = genDataArray;
+% estimate pseudo-reference with geometric mean row by row
+pseudoRefSample = geomean(counts,2);
+nz = pseudoRefSample > 0;
+ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));
+sizeFactors = median(ratios,1);
+% transform to common scale
+normCounts = bsxfun(@rdivide,counts,sizeFactors);
+normCounts(1:10,:);
+%% Boxplots
+figure;
+subplot(2,1,1)
+maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')
+ylabel('sample');
+xlabel('log2(counts)');
+subplot(2,1,2)
+maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')
+ylabel('sample');
+xlabel('log2(counts)');
+</syntaxhighlight>
+=== Computing Mean, Dispersion and Fold Change ===
+<syntaxhighlight lang="matlab" line>
+%% Computing Mean, Dispersion and Fold Change
+% consider the mean
+% For each feature, we compute the mean across all the patients with
+% high/low gleason
+meanHigh = mean(normCounts(:,labels_high),2);
+meanLow = mean(normCounts(:,labels_low),2);
+% consider the dispersion
+dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;
+dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;
+% plot on a log-log scale
+figure;
+loglog(meanHigh,dispHigh,'r.');
+hold on;
+loglog(meanLow,dispLow,'b.');
+xlabel('log2(Mean)');
+ylabel('log2(Dispersion)');
+legend('High','Low','Location','southwest');
+% compute the mean and the log2FC
+meanBase = (meanHigh + meanLow) / 2;
+foldChange = meanHigh ./ meanLow;
+log2FC = log2(foldChange);
+</syntaxhighlight>
+=== MA Plot ===
+<syntaxhighlight lang="matlab" line>
+%% MA Plot
+% plot mean vs. fold change (MA plot)
+mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);
+set(get(gca,'Xlabel'),'String','mean of normalized counts')
+set(get(gca,'Ylabel'),'String','log2(fold change)')
+</syntaxhighlight>
+==== Interactive ====
+<syntaxhighlight lang="matlab" line>
+%% Interactive MA Plot
+mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');
+</syntaxhighlight>
+=== Gene Table ===
+<syntaxhighlight lang="matlab" line>
+%% Create Gene Table
+% create table with statistics about each gene
+geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);
+geneTable.Properties.RowNames = genData.Properties.RowNames;
+% summary
+summary(geneTable)
+% preview
+geneTable(1:10,:)
+</syntaxhighlight>
+=== Inferring Differential Expression with a Negative Binomial Model ===
+==== Constant Variance Link ====
+<syntaxhighlight lang="matlab" line>
+%% Constant Variance Link
+tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...
+                  'VarianceLink','Constant');
+h = plotVarianceLink(tConstant, 'Compare', 0);
+% set custom title
+h(1).Title.String = 'Variance Link on Low Samples';
+h(2).Title.String = 'Variance Link on High Samples';
+% save
+saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");
+saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");
+</syntaxhighlight>
+==== Local Regression Variance Link ====
+<syntaxhighlight lang="matlab" line>
+%% Local Regression Variance Link
+tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');
+h = plotVarianceLink(tLocal, 'Compare', 1);
+% set custom title
+h(1).Title.String = 'Variance Link on Low Samples';
+h(2).Title.String = 'Variance Link on High Samples';
+% save
+saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");
+saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");
+</syntaxhighlight>
+=== Histogram of P-values ===
+<syntaxhighlight lang="matlab" line>
+%% Histogram of P-values
+h = figure;
+histogram(tLocal.pValue,100)
+xlabel('P-value');
+ylabel('Frequency');
+title('P-value enrichment')
+saveas(h, path_images_folder + "p-values_histogram_gleason.png");
+</syntaxhighlight>
+==== Histogram of P-values without low count genes ====
+<syntaxhighlight lang="matlab" line>
+%%% Histogram of P-values without low count genes
+h = figure;
+lowCountThreshold = 10;
+lowCountGenes = all(counts < lowCountThreshold, 2);
+histogram(tLocal.pValue(~lowCountGenes),100)
+xlabel('P-value');
+ylabel('Frequency');
+title('P-value enrichment without low count genes')
+saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");
+</syntaxhighlight>
+=== Multiple testing adjusted P-values ===
+<syntaxhighlight lang="matlab" line>
+%% Multiple testing adjusted P-values
+% compute the adjusted P-values (BH correction)
+padj = mafdr(tLocal.pValue,'BHFDR',true);
+% add to the existing table
+geneTable.pvalue = tLocal.pValue;
+geneTable.padj = padj;
+% create a table with significant genes using adjusted p-values
+sig = geneTable.padj < 0.1;
+geneTableSig = geneTable(sig,:);
+geneTableSig = sortrows(geneTableSig,'padj');
+numberSigGenes = size(geneTableSig,1);
+fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);
+</syntaxhighlight>
+=== Identifying the Most Up-regulated and Down-regulated Genes ===
+<syntaxhighlight lang="matlab" line>
+%% Identifying the Most Up-regulated and Down-regulated Genes
+% find up-regulated genes
+up = geneTableSig.log2FC > 1;
+upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');
+numberSigGenesUp = sum(up);
+fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);
+% find down-regulated genes
+down = geneTableSig.log2FC < -1;
+downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');
+numberSigGenesDown = sum(down);
+fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);
+</syntaxhighlight>
+=== Plot FC vs Mean ===
+<syntaxhighlight lang="matlab" line>
+%% Plot FC vs Mean
+% A good visualization of the gene expressions and their significance is given by
+% plotting the fold change versus the mean in log scale and coloring the data points according
+% to the adjusted P-values.
+figure
+scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')
+colormap(flipud(cool(256)));
+colorbar;
+ylabel('log2(Fold Change)');
+xlabel('log2(Mean of normalized counts)');
+title('Fold change by FDR')
+% You can see here that for weakly expressed genes (i.e. those with low means),
+% the FDR is generally high because low read counts are dominated by Poisson noise
+% and consequently any biological variability is drowned in the uncertainties from the read counting.
+</syntaxhighlight>