# Gene identifier mapping helpers built on biomaRt.
#
# The script downloads two mapping tables (Ensembl and UniProt) and defines
# helpers for comparing ID sets and translating Ensembl gene IDs to symbols.

# Install and load biomaRt ----
# NOTE(review): biocLite() is the legacy Bioconductor installer; current
# Bioconductor releases use BiocManager::install("biomaRt") instead. Left
# unchanged here so the script's dependencies stay the same -- confirm which
# Bioconductor version this is run against.
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
library(biomaRt)

# Ensembl mapping table ----
ensembl <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
ensembl_mapping <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "uniprot_genename",
    "uniprot_swissprot_accession",
    "hgnc_symbol"
  ),
  mart = ensembl
)
# Rename the columns so they line up with the names used for uniprot_mapping.
colnames(ensembl_mapping) <- c(
  "ensembl_id",
  "uniprot_genename",
  "uniprot_accession",
  "gene_symbol"
)

# UniProt mapping table ----
# NOTE(review): verify that the "unimart" mart is still served by the BioMart
# registry in use; it may have been retired.
uniprot <- useMart(biomart = "unimart", dataset = "uniprot")
uniprot_mapping <- getBM(
  attributes = c("accession", "gene_name", "ensembl_id"),
  mart = uniprot
)
colnames(uniprot_mapping) <- c("uniprot_accession", "gene_symbol", "ensembl_id")

# NOTE(review): the functions below read a data frame called `gene_names`
# with columns `ensembl_gene_id` and `hgnc_symbol`. It is not defined in this
# script; presumably it is created elsewhere (or is an earlier name for
# `ensembl_mapping` before its columns were renamed) -- confirm.

#' Write a string to standard output.
#'
#' @param string Value to print; passed unchanged to `write()`.
print_string <- function(string) {
  # file = "" makes write() send its output to the console.
  write(string, file = "")
}

#' Print a summary comparing the unique elements of two vectors.
#'
#' Reports, for each input, its length and number of unique values, followed
#' by the sizes of the intersection and of both set differences (all computed
#' on the de-duplicated inputs).
#'
#' @param set1 First vector.
#' @param set2 Second vector.
compare_sets <- function(set1, set2) {
  uset1 <- unique(set1)
  uset2 <- unique(set2)
  print_string(sprintf(
    "Set1: %d elements, %d unique.", length(set1), length(uset1)
  ))
  print_string(sprintf(
    "Set2: %d elements, %d unique.", length(set2), length(uset2)
  ))
  print_string(sprintf(
    "Intersection: %d elements.", length(intersect(uset1, uset2))
  ))
  print_string(sprintf(
    "Set1 - Set2: %d elements.", length(setdiff(uset1, uset2))
  ))
  print_string(sprintf(
    "Set2 - Set1: %d elements.", length(setdiff(uset2, uset1))
  ))
}

#' Look up the HGNC symbol for one Ensembl gene ID.
#'
#' Reads the global `gene_names` data frame (see note above).
#'
#' @param ensembl_gene_id Ensembl gene ID to look up.
#' @return The `hgnc_symbol` entry of the single matching row.
#' Signals an error when the ID matches zero rows or more than one row.
convert_ensembl_to_genesymbol <- function(ensembl_gene_id) {
  # which() drops NA comparisons, so missing IDs in the table cannot turn the
  # match count into NA and break the check below.
  hits <- which(gene_names$ensembl_gene_id == ensembl_gene_id)
  if (length(hits) != 1L) {
    # The original called error(), which does not exist in R; stop() is the
    # base function for signalling an error.
    stop("ensembl gene id found zero or more than one times.")
  }
  gene_names$hgnc_symbol[hits]
}

#' Look up gene-name rows for a vector of Ensembl gene IDs.
#'
#' Reads the global `gene_names` data frame (see note above).
#'
#' @param ensembl_ids Vector of Ensembl gene IDs.
#' @return A data frame from a left join of the IDs onto `gene_names` by
#'   `ensembl_gene_id`: every input ID is kept (`all.x = TRUE`) and rows of
#'   `gene_names` without a matching input ID are dropped (`all.y = FALSE`).
convert_ensembl_to_genesymbol_vector <- function(ensembl_ids) {
  # Local name chosen so it does not shadow the global `ensembl` mart object.
  query <- data.frame(ensembl_gene_id = ensembl_ids)
  merge(query, gene_names, by = "ensembl_gene_id", all.x = TRUE, all.y = FALSE)
}

#' Write the `gene_names` table to a CSV file.
#'
#' Reads the global `gene_names` data frame (see note above). The file is
#' written without row names and without quoting.
#'
#' @param file Output path. Defaults to the location the script originally
#'   hard-coded, so existing zero-argument calls behave as before.
write_all_gene_names <- function(
    file = "~/Dropbox/genenet/data/gene_symbol_mapping.csv") {
  write.csv(gene_names, file = file, row.names = FALSE, quote = FALSE)
}