Analysis of spatially resolved RNA-seq data

Similarly to what shown analyzing ATAC data, we retrieved fastq from the paper “Spatial epigenome–transcriptome co-profiling of mammalian tissues” (Zhang et al, 2023, DOI:10.1038/s41586-023-05795-1), this time RNA sequences (SRA run ID: SRR22561636) and processed using kb_python. The resulting count matrix file is imported and processed as standard scRNA-seq data

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import *

import anndata as ad
import scanpy as sc

import scipy.sparse as ssp
import scipy.stats as sst

import spatialdata_io
import spatialdata_plot
import spatialdata as sd

import squidpy as sq
import schist as scs
import warnings
warnings.filterwarnings('ignore')
#import magic
from tqdm import tqdm

def set_res(high=True):
        dpi=80
        if high:
                dpi=150
        sc.set_figure_params(dpi=dpi, fontsize=6)
        rcParams['axes.grid'] = False

set_res(False)

First we import the .h5ad file computed by kb_python. After that we perform some QC and filtering, as proposed by sc-best-practices.

adata = sc.read_h5ad("SRR22561636/counts_unfiltered/adata.h5ad")

adata.var["mt"] = adata.var_names.str.startswith("mt-")
adata.var["ribo"] = adata.var_names.str.startswith(("Rps", "Rpl"))
sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True
)

def is_outlier(adata, metric: str, nmads: int):
        M = adata.obs[metric]
        outlier = (M < np.median(M) - nmads * sst.median_abs_deviation(M)) | (
                np.median(M) + nmads * sst.median_abs_deviation(M) < M
        )
        return outlier

adata.obs["outlier"] = (
        is_outlier(adata, "log1p_total_counts", 5)
        | is_outlier(adata, "log1p_n_genes_by_counts", 5)
        | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)


adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (
        adata.obs["pct_counts_mt"] > 8
)

In the end we retain most of the pixels

print(f"Total number of cells: {adata.n_obs}")
adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

Total number of cells: 2500
Number of cells after filtering of low quality cells: 2157

We select highly variable genes and then perform normalization (PFlog1pPF). Lastly, after PCA is computed, we save the anndata to build later the spatialdata object.

sc.pp.highly_variable_genes(adata,  flavor='seurat_v3_paper')

pf = adata.X.sum(axis=1).A.ravel()
l1pf = np.log1p(ssp.diags(pf.mean()/pf)@adata.X)
pf = l1pf.sum(axis=1).A.ravel()
adata.X = ssp.diags(pf.mean()/pf)@l1pf

sc.tl.pca(adata, use_highly_variable=True)
adata.write("analysis_rna/rna.h5ad")

Here the anndata is imported using the DBiT-seq plugin for spatialdata. Note that we had to rotate the original image from the paper 90 degrees CCW, as the current version of the plugin orders the barcodes differently from what is displayed in the original paper.

spdata = spatialdata_io.readers.dbit.dbit(path='analysis_rna',
                                     anndata_path='analysis_rna/rna.h5ad',
                                     barcode_position='barcodes.txt',
                                     image_path='ME13_50um_spatial/tissue_hires_image.png',
                                     dataset_id='ME13_50um_spatial')

We can visualize a QC value (number of genes) in its context

We create the kNN graph using the PCA embedding, then we apply schist to find the hierarchical cell structure

basis='pca'
sc.settings.verbosity=2
scs.inference.fit_model(spdata.table,
                                                   neighbors_key=f'{basis}_neighbors')
sc.settings.verbosity=0

minimizing the nested Stochastic Block Model
        minimization step done (0:01:14)
        consensus step done (0:01:25)
    done (0:01:25)
    finished (0:01:25)

sc.pl.umap(spdata.table, color=['nsbm_level_1', 'nsbm_level_2'])

Finally we can check how clusters distribute in their spatial context. At level 1 we can distinguish certain clusters that belong to the neual tissues. Here the results appear to be in line with the publication

set_res(True)
spdata.pl.render_images().pl.render_shapes(color='nsbm_level_1', fill_alpha=.7).pl.show(title='Level 1', colorbar=True)
xticks([])
yticks([])
plt.xlabel('A barcodes')
plt.ylabel('B barcodes')

Similarly to what has been done for ATAC, we perform the analysis of a multimodal data, where one modality is represented by the spatial graph

sq.gr.spatial_neighbors(spdata.table, n_neighs=8, coord_type='grid')

_tmp = spdata.table.copy()
sc.settings.verbosity=2
scs.inference.fit_model_multi([spdata.table, _tmp],
                              key_added='spt',
                              neighbors_key=['pca_neighbors', 'spatial_neighbors'])
sc.settings.verbosity=0

minimizing the nested Stochastic Block Model
getting adjacency for data 0 (0:00:00)
getting adjacency for data 1 (0:00:00)
        minimization step done (0:04:11)
        consensus step done (0:04:21)
    done (0:04:21)
    finished (0:04:22)

Again, we visualize the structured data, with a better resolution of anatomical structures

set_res(True)
spdata.pl.render_images().pl.render_shapes(color='spt_level_1', fill_alpha=.7).pl.show(title='SPT_level_1', colorbar=True)
xticks([])
yticks([])
plt.xlabel('A barcodes')
plt.ylabel('B barcodes')

Again, we save data for later use in integrated analysis.

spdata.write('analysis/SRR22561636.zarr')