Thursday, January 24, 2013

Retrieve SNPs in Promoters and Flanking Sequence for Many Genes

This post presents R code to retrieve SNPs in promoters for a list of genes.  You provide your list of genes to the "gene" variable and then use biomaRt (mus_musculus) to get the transcriptional start sites (TSS) for each transcript for your list of genes.  The code then uses the TSS info to search in promoter regions (defined here as [-1000, +200 bp] relative to TSS) for SNPs for each gene using biomaRt - my code looks for SNPs distinguishing C57BL/6J and CastEiJ mouse strains, but this can be easily altered in the getSnps function by changing "sp$cast_eij" to another strain name (use listAttributes(snpmart) in biomaRt).

You can view the promoters found to have SNPs in the "dataSnps" variable.   Select the transcript SNP set you are interested in and run the final "getflank" function to get the surrounding sequence information using the mouse genome data provided in the BSgenome library.  The code finds 100 bp of flanking sequence around each SNP site and this can adjust with the "offset" variable in "getflank" function.  The "final" output provides row names and writes a tab delimited file that opens easily in Excel.

This is a quick way to get SNP data in promoter regions for genes of interest.


library(biomaRt)
library(BSgenome.Mmusculus.UCSC.mm10)
setwd("")

#######################INPUT GENES OF INTEREST####
genes = c("Igf2","H19", "Igf2R", "Rasgrf", "Magel2")


#######################GET TSS#############################
#get transcriptional start sites for all genes of interest

ensembl = useMart("ensembl")
ensembl = useDataset("mmusculus_gene_ensembl", mart=ensembl)
tss <- getBM(attributes=c('ensembl_gene_id', 'ensembl_transcript_id', 'chromosome_name','transcript_start', 'transcript_end','mgi_symbol'), filters = c("mgi_symbol"), values=genes, mart=ensembl)

########################GET SNPS IN PROMOTER###########################
#get snps in promoter region : promoter region is defined as [-1000,+200] relative to transcriptional start site

snpmart = useMart("snp")
snpmart = useDataset("mmusculus_snp", mart=snpmart)

getSnps <- function(x){
    txstart = as.numeric(x[4])
    txend = as.numeric(x[5])
    id = x[1]
    txid = x[2]
    name = x[6]    
    chr = x[3]
    if ( txstart > txend ) {
        getBM(attributes=c('refsnp_id', 'chr_name', 'chrom_start', 'allele', 'c57bl_6j', 'cast_eij'), filters=c("chr_name", "chrom_start", "chrom_end"), values=list( chr, (txstart - 200), (txstart + 1000)), mart = snpmart)->sp
        sp[sp$cast_eij %in% c("A","T","G","C"), ]->dataSnps

        }    else if ( txstart < txend ) {    
        getBM(attributes=c('refsnp_id', 'chr_name', 'chrom_start', 'allele', 'c57bl_6j', 'cast_eij'), filters=c("chr_name", "chrom_start", "chrom_end"), values=list( 2, ( txstart - 1000 ), ( txstart +200 )), mart = snpmart)->sp
        sp[sp$cast_eij %in% c("A","T","G","C"), ]->dataSnps
        }
    if ( nrow(dataSnps) > 1 ) {
    cbind(id,txid,name,dataSnps, txstart)->results
    return(results)
    } 
}
    
snps<-apply(tss, 1, getSnps)
dataSnps <- snps[!sapply(snps, is.null)]#All SNP Info with NULLS removed

###########################GET FLANKING SEQUENCE FOR EACH SNP###########################
#Get sequence for snps

dataSnps #view output 

dataSnps[[19]] -> d #select SNP set for transcript of interest

getflank <- function(x) {
    id = x[1]
    txid = x[2]
    name = x[3]
    position = as.numeric(x[6])
    alleles = paste("[",x[7]," > ",x[9],"]", sep="")
    chr = paste("chr", x[5], sep="")
    txstart = x[10]
    offset = 100
    leftflank  <- getSeq(BSgenome.Mmusculus.UCSC.mm10,chr,position-offset,position-1)
    rightflank <- getSeq(BSgenome.Mmusculus.UCSC.mm10,chr,position+1,position+offset)
    paste(leftflank,alleles,rightflank,sep="")->seq
    cbind(id, txid, name, txstart, position, alleles, chr, seq)->out
    return(out)
}


final <- apply(d, 1, getflank)
r<- c("id","txid", "name", "txstart", "snp position", "Ref > Alt", "chr", "Seq")
rownames(final) <- r
write.table(final, file = "SNPSandFlankingSEQinPROMOTERS_Gene.txt", sep="\t")