RNASeq MATLAB

Da Bioingegneria Elettronica e Informatica.

Nicola Altini nicola.altini@poliba.it

Simona De Summa s.desumma@oncologico.bari.it

Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it

Slides

Example: lncRNAs in prostate cancer in relation to Gleason score

  1. %% lncRNAs in prostate cancer in relation to Gleason score
  2. %%% Read Input Data
  3. path_images_folder = "MATLAB/RNASeq/images/";
  4. % Loading read counts data
  5. filenameGenData = "matrix_seminario.csv";
  6. genData = readtable(filenameGenData, 'ReadRowNames', 1, 'ReadVariableNames', 1);
  7. % Loading clinical data
  8. filenameClinicalData = "design_gleason.csv";
  9. clinicalData = readtable(filenameClinicalData);
  10.  
  11. % Count Features and Samples
  12. nFeatures = size(genData, 1);
  13. nPatients = size(genData, 2);
  14. fprintf("There are %d features and %d samples\n", nFeatures, nPatients);

Labels Assignment

  1. %%% Labels
  2. % Create label vectors for high/low gleason score patients
  3. labels_low = [];
  4. labels_high = [];
  5. for i = 1:nPatients
  6.     label = clinicalData{i,3}{1};
  7.     if label == "low"
  8.         labels_low = [labels_low, i];
  9.     else
  10.         labels_high = [labels_high, i];
  11.     end
  12. end
  13. nLows = numel(labels_low);
  14. nHighs = numel(labels_high);
  15. fprintf("There are:\n%d patients with low gleason\n%d patients with high gleason\n", nLows, nHighs);

Dimensionality Reduction

PCA

  1. %% PCA
  2. genDataArray = table2array(genData);
  3. genDataArrayT = genDataArray';
  4. genDataArrayNormT = log2(genDataArrayT + 1);
  5. [coeff,score,latent,tsquared,explained,mu] = pca(genDataArrayNormT, 'Centered', 0);
  6.  
  7. %%% Plot PC1 + PC2
  8. figure
  9. scatter(coeff(labels_low,1), coeff(labels_low,2), 1, 'black');
  10. axis([-0.5 0.5 -0.5 0.5])
  11. hold on 
  12. scatter(coeff(labels_high,1), coeff(labels_high,2), 1, 'red');

t-SNE

  1. %% tSNE
  2. nDim = 3;
  3. genDataEmbedding = tsne(genDataArrayT, 'Algorithm', 'exact', 'NumDimensions', nDim);
  4. figure
  5. if nDim == 2
  6.     scatter(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), 3, 'black');
  7.     hold on
  8.     scatter(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), 3, 'red');
  9. elseif  nDim == 3
  10.     scatter3(genDataEmbedding(labels_low,1), genDataEmbedding(labels_low,2), ...
  11.              genDataEmbedding(labels_low,3), 3, 'black');
  12.     hold on
  13.     scatter3(genDataEmbedding(labels_high,1), genDataEmbedding(labels_high,2), ...
  14.              genDataEmbedding(labels_high,3), 3, 'red');
  15. end

Normalizing Read Counts

  1. %% Normalizing Read Counts
  2. counts = genDataArray;
  3.  
  4. % estimate pseudo-reference with geometric mean row by row
  5. pseudoRefSample = geomean(counts,2);
  6. nz = pseudoRefSample > 0;
  7. ratios = bsxfun(@rdivide,counts(nz,:),pseudoRefSample(nz));
  8. sizeFactors = median(ratios,1);
  9.  
  10. % transform to common scale
  11. normCounts = bsxfun(@rdivide,counts,sizeFactors);
  12. normCounts(1:10,:);
  13.  
  14. %% Boxplots
  15. figure;
  16.  
  17. subplot(2,1,1)
  18. maboxplot(log2(counts(:,1:4)),'title','Raw Read Count','orientation','horizontal')
  19. ylabel('sample');
  20. xlabel('log2(counts)');
  21.  
  22. subplot(2,1,2)
  23. maboxplot(log2(normCounts(:,1:4)),'title','Normalized Read Count','orientation','horizontal')
  24. ylabel('sample');
  25. xlabel('log2(counts)');

Computing Mean, Dispersion and Fold Change

  1. %% Computing Mean, Dispersion and Fold Change
  2. % consider the mean
  3. % For each feature, we compute the mean across all the patients with
  4. % high/low gleason
  5. meanHigh = mean(normCounts(:,labels_high),2);
  6. meanLow = mean(normCounts(:,labels_low),2);
  7.  
  8. % consider the dispersion
  9. dispHigh = std(normCounts(:,labels_high),0,2) ./ meanHigh;
  10. dispLow = std(normCounts(:,labels_low),0,2) ./ meanLow;
  11.  
  12. % plot on a log-log scale
  13. figure;
  14. loglog(meanHigh,dispHigh,'r.');
  15. hold on;
  16. loglog(meanLow,dispLow,'b.');
  17. xlabel('log2(Mean)');
  18. ylabel('log2(Dispersion)');
  19. legend('High','Low','Location','southwest');
  20.  
  21. % compute the mean and the log2FC
  22. meanBase = (meanHigh + meanLow) / 2;
  23. foldChange = meanHigh ./ meanLow;
  24. log2FC = log2(foldChange);

MA Plot

  1. %% MA Plot
  2. % plot mean vs. fold change (MA plot)
  3. mairplot(meanHigh, meanLow,'Type','MA','Plotonly',true);
  4. set(get(gca,'Xlabel'),'String','mean of normalized counts')
  5. set(get(gca,'Ylabel'),'String','log2(fold change)')

Interactive

  1. %% Interactive MA Plot
  2. mairplot(meanHigh,meanLow,'Labels',genData.Properties.RowNames,'Type','MA');

Gene Table

  1. %% Create Gene Table
  2. % create table with statistics about each gene
  3. geneTable = table(meanBase,meanHigh,meanLow,foldChange,log2FC);
  4. geneTable.Properties.RowNames = genData.Properties.RowNames;
  5. % summary
  6. summary(geneTable)
  7.  
  8. % preview
  9. geneTable(1:10,:)

Inferring Differential Expression with a Negative Binomial Model

Constant Variance Link

  1. %% Constant Variance Link  
  2. tConstant = nbintest(counts(:,labels_low),counts(:,labels_high),...
  3.                   'VarianceLink','Constant');
  4. h = plotVarianceLink(tConstant, 'Compare', 0);
  5. % set custom title
  6. h(1).Title.String = 'Variance Link on Low Samples';
  7. h(2).Title.String = 'Variance Link on High Samples';
  8. % save
  9. saveas(h(1),path_images_folder + "tConstant_treated_0_prostate.png");
  10. saveas(h(2),path_images_folder + "tConstant_untreated_0_prostate.png");

Local Regression Variance Link

  1. %% Local Regression Variance Link
  2. tLocal = nbintest(counts(:,labels_low),counts(:,labels_high),'VarianceLink','LocalRegression');
  3. h = plotVarianceLink(tLocal, 'Compare', 1);
  4. % set custom title
  5. h(1).Title.String = 'Variance Link on Low Samples';
  6. h(2).Title.String = 'Variance Link on High Samples';
  7. % save
  8. saveas(h(1),path_images_folder + "tLocal_treated_prostate.png");
  9. saveas(h(2),path_images_folder + "tLocal_untreated_prostate.png");

Histogram of P-values

  1. %% Histogram of P-values
  2. h = figure;
  3. histogram(tLocal.pValue,100)
  4. xlabel('P-value');
  5. ylabel('Frequency');
  6. title('P-value enrichment')
  7. saveas(h, path_images_folder + "p-values_histogram_gleason.png");

Histogram of P-values without low count genes

  1. %%% Histogram of P-values without low count genes
  2. h = figure;
  3. lowCountThreshold = 10;
  4. lowCountGenes = all(counts < lowCountThreshold, 2);
  5. histogram(tLocal.pValue(~lowCountGenes),100)
  6. xlabel('P-value');
  7. ylabel('Frequency');
  8. title('P-value enrichment without low count genes')
  9. saveas(h, path_images_folder + "p-values_histogram_without_low_count_genes_gleason.png");

Multiple testing adjusted P-values

  1. %% Multiple testing adjusted P-values
  2. % compute the adjusted P-values (BH correction)
  3. padj = mafdr(tLocal.pValue,'BHFDR',true);
  4.  
  5. % add to the existing table
  6. geneTable.pvalue = tLocal.pValue;
  7. geneTable.padj = padj;
  8.  
  9. % create a table with significant genes using adjusted p-values
  10. sig = geneTable.padj < 0.1;
  11. geneTableSig = geneTable(sig,:);
  12. geneTableSig = sortrows(geneTableSig,'padj');
  13. numberSigGenes = size(geneTableSig,1);
  14.  
  15. fprintf("There are %d significant genes on %d\n", numberSigGenes, nFeatures);

Identifying the Most Up-regulated and Down-regulated Genes

  1. %% Identifying the Most Up-regulated and Down-regulated Genes
  2. % find up-regulated genes
  3. up = geneTableSig.log2FC > 1;
  4. upGenes = sortrows(geneTableSig(up,:),'log2FC','descend');
  5. numberSigGenesUp = sum(up);
  6. fprintf('There are %d Up-regulated genes\n', numberSigGenesUp);
  7.  
  8. % find down-regulated genes
  9. down = geneTableSig.log2FC < -1;
  10. downGenes = sortrows(geneTableSig(down,:),'log2FC','ascend');
  11. numberSigGenesDown = sum(down);
  12. fprintf('There are %d Down-regulated genes\n', numberSigGenesUp);

Plot FC vs Mean

  1. %% Plot FC vs Mean
  2. % A good visualization of the gene expressions and their significance is given by 
  3. % plotting the fold change versus the mean in log scale and coloring the data points according 
  4. % to the adjusted P-values.
  5.  
  6. figure
  7. scatter(log2(geneTable.meanBase),geneTable.log2FC,20,geneTable.padj,'o')
  8. colormap(flipud(cool(256)));
  9. colorbar;
  10. ylabel('log2(Fold Change)');
  11. xlabel('log2(Mean of normalized counts)');
  12. title('Fold change by FDR')
  13.  
  14. % You can see here that for weakly expressed genes (i.e. those with low means), 
  15. % the FDR is generally high because low read counts are dominated by Poisson noise
  16. % and consequently any biological variability is drowned in the uncertainties from the read counting.