You can view the promoters found to have SNPs in the "dataSnps" variable. Select the transcript SNP set you are interested in and run the final "getflank" function to get the surrounding sequence information using the mouse genome data provided in the BSgenome library. The code finds 100 bp of flanking sequence around each SNP site and this can adjust with the "offset" variable in "getflank" function. The "final" output provides row names and writes a tab delimited file that opens easily in Excel.
This is a quick way to get SNP data in promoter regions for genes of interest.
library(biomaRt)
library(BSgenome.Mmusculus.UCSC.mm10)
setwd("")
#######################INPUT GENES OF INTEREST####
genes = c("Igf2","H19", "Igf2R", "Rasgrf", "Magel2")
#######################GET TSS#############################
#get transcriptional start sites for all genes of interest
ensembl = useMart("ensembl")
ensembl = useDataset("mmusculus_gene_ensembl", mart=ensembl)
tss <- getBM(attributes=c('ensembl_gene_id', 'ensembl_transcript_id', 'chromosome_name','transcript_start', 'transcript_end','mgi_symbol'), filters = c("mgi_symbol"), values=genes, mart=ensembl)
########################GET SNPS IN PROMOTER###########################
#get snps in promoter region : promoter region is defined as [-1000,+200] relative to transcriptional start site
snpmart = useMart("snp")
snpmart = useDataset("mmusculus_snp", mart=snpmart)
getSnps <- function(x){
txstart = as.numeric(x[4])
txend = as.numeric(x[5])
id = x[1]
txid = x[2]
name = x[6]
chr = x[3]
if ( txstart > txend ) {
getBM(attributes=c('refsnp_id', 'chr_name', 'chrom_start', 'allele', 'c57bl_6j', 'cast_eij'), filters=c("chr_name", "chrom_start", "chrom_end"), values=list( chr, (txstart - 200), (txstart + 1000)), mart = snpmart)->sp
sp[sp$cast_eij %in% c("A","T","G","C"), ]->dataSnps
} else if ( txstart < txend ) {
getBM(attributes=c('refsnp_id', 'chr_name', 'chrom_start', 'allele', 'c57bl_6j', 'cast_eij'), filters=c("chr_name", "chrom_start", "chrom_end"), values=list( 2, ( txstart - 1000 ), ( txstart +200 )), mart = snpmart)->sp
sp[sp$cast_eij %in% c("A","T","G","C"), ]->dataSnps
}
if ( nrow(dataSnps) > 1 ) {
cbind(id,txid,name,dataSnps, txstart)->results
return(results)
}
}
snps<-apply(tss, 1, getSnps)
dataSnps <- snps[!sapply(snps, is.null)]#All SNP Info with NULLS removed
###########################GET FLANKING SEQUENCE FOR EACH SNP###########################
#Get sequence for snps
dataSnps #view output
dataSnps[[19]] -> d #select SNP set for transcript of interest
getflank <- function(x) {
id = x[1]
txid = x[2]
name = x[3]
position = as.numeric(x[6])
alleles = paste("[",x[7]," > ",x[9],"]", sep="")
chr = paste("chr", x[5], sep="")
txstart = x[10]
offset = 100
leftflank <- getSeq(BSgenome.Mmusculus.UCSC.mm10,chr,position-offset,position-1)
rightflank <- getSeq(BSgenome.Mmusculus.UCSC.mm10,chr,position+1,position+offset)
paste(leftflank,alleles,rightflank,sep="")->seq
cbind(id, txid, name, txstart, position, alleles, chr, seq)->out
return(out)
}
final <- apply(d, 1, getflank)
r<- c("id","txid", "name", "txstart", "snp position", "Ref > Alt", "chr", "Seq")
rownames(final) <- r
write.table(final, file = "SNPSandFlankingSEQinPROMOTERS_Gene.txt", sep="\t")