# Set-up: install/load biomaRt, then pull two identifier-mapping tables
# (one from the Ensembl mart, one from the UniProt mart) into the workspace.
# NOTE(review): the next line executes a remote script fetched over plain
# HTTP, and biocLite("biomaRt") is invoked on every run of this file.
# biocLite is believed to be superseded by BiocManager::install() on current
# Bioconductor releases -- TODO confirm and migrate.
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
library(biomaRt)
# Connect to the Ensembl mart, human gene dataset.
ensembl <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Request four attributes per gene: Ensembl gene id, UniProt gene name,
# UniProt/Swiss-Prot accession and HGNC symbol.
# NOTE(review): availability of "uniprot_genename" depends on the Ensembl
# release being queried -- verify with listAttributes(ensembl).
ensembl_mapping <- getBM(attributes = c("ensembl_gene_id", "uniprot_genename", "uniprot_swissprot_accession", "hgnc_symbol"), mart = ensembl)
# Rename columns positionally; assumes getBM returns the columns in the order
# they were requested -- TODO confirm.
colnames(ensembl_mapping) <- c("ensembl_id", "uniprot_genename", "uniprot_accession", "gene_symbol")
# Connect to the UniProt mart.
# NOTE(review): the "unimart" service may no longer be hosted -- verify with
# listMarts() before relying on this.
uniprot <- useMart(biomart = "unimart", dataset = "uniprot")
# Request accession, gene name and Ensembl id for every UniProt entry.
uniprot_mapping <- getBM(attributes = c("accession", "gene_name", "ensembl_id"), mart = uniprot)
# Rename so the column names line up with those of ensembl_mapping
# (same positional-order assumption as above).
colnames(uniprot_mapping) <- c("uniprot_accession", "gene_symbol", "ensembl_id")
# NOTE(review): the converter/export functions further down this file read a
# global `gene_names` with columns `ensembl_gene_id` / `hgnc_symbol`; no such
# object is created in this set-up section -- confirm where it comes from.

#' Print a value to standard output.
#'
#' Thin wrapper around `write()` with `file = ""`, which directs the output
#' to the console / standard output instead of a file on disk.
#'
#' @param string Value to print (typically a character vector).
#' @return The value returned by `write()`; called for its side effect.
print_string <- function(string) {
  write(x = string, file = "")
}

#' Print a short overlap report for two collections.
#'
#' Reports, one line each: the raw and de-duplicated size of each input, the
#' size of the intersection of the de-duplicated inputs, and the size of each
#' one-sided difference. Output goes through `print_string()`.
#'
#' @param set1 First vector of elements.
#' @param set2 Second vector of elements.
#' @return The value of the final `print_string()` call; called for its
#'   printed output.
compare_sets <- function(set1, set2) {
  distinct1 <- unique(set1)
  distinct2 <- unique(set2)

  # Format-and-print helper; arguments are evaluated lazily, so each count is
  # computed right before its line is printed.
  report <- function(template, ...) {
    print_string(sprintf(template, ...))
  }

  report("Set1: %d elements, %d unique.", length(set1), length(distinct1))
  report("Set2: %d elements, %d unique.", length(set2), length(distinct2))
  report("Intersection: %d elements.", length(intersect(distinct1, distinct2)))
  report("Set1 - Set2: %d elements.", length(setdiff(distinct1, distinct2)))
  report("Set2 - Set1: %d elements.", length(setdiff(distinct2, distinct1)))
}



#' Look up the HGNC gene symbol for a single Ensembl gene id.
#'
#' @param ensembl_gene_id A single Ensembl gene id to look up.
#' @param mapping Data frame with columns `ensembl_gene_id` and `hgnc_symbol`.
#'   Defaults to the global `gene_names`, resolved when the function is
#'   called, so existing one-argument calls behave as before.
#' @return The `hgnc_symbol` entry of the unique matching row.
#' @section Errors:
#'   Signals an error when the id matches zero rows or more than one row.
convert_ensembl_to_genesymbol <- function(ensembl_gene_id,
                                          mapping = gene_names) {
  # which() ignores NA comparisons, so NA ids in the table (or an NA query)
  # count as "no match" instead of breaking the uniqueness check.
  hits <- which(mapping$ensembl_gene_id == ensembl_gene_id)
  if (length(hits) != 1L) {
    # stop() is the base R condition signaller; there is no base `error()`.
    stop("ensembl gene id found zero or more than one times.", call. = FALSE)
  }
  mapping$hgnc_symbol[hits]
}

#' Attach gene-symbol information to a vector of Ensembl gene ids.
#'
#' Performs a left join of the ids onto the mapping table via `merge()`:
#' every input id is kept, and ids without a match get `NA` in the mapping
#' columns. Ids that match several mapping rows yield several output rows.
#' Row order is whatever `merge()` produces, not necessarily input order.
#'
#' @param ensembl_ids Vector of Ensembl gene ids.
#' @param mapping Data frame with an `ensembl_gene_id` column plus the columns
#'   to attach. Defaults to the global `gene_names`, resolved when the
#'   function is called, so existing one-argument calls behave as before.
#' @return A data frame with `ensembl_gene_id` and the remaining columns of
#'   `mapping`.
convert_ensembl_to_genesymbol_vector <- function(ensembl_ids,
                                                 mapping = gene_names) {
  query <- data.frame(ensembl_gene_id = ensembl_ids)
  merge(
    query, mapping,
    by = "ensembl_gene_id",
    all.x = TRUE, all.y = FALSE
  )
}

#' Export the gene-name mapping table as a CSV file.
#'
#' @param mapping Data frame to export. Defaults to the global `gene_names`,
#'   resolved when the function is called.
#' @param file Destination path. Defaults to the project's Dropbox location,
#'   so a zero-argument call writes to the same place as before.
#' @return The value of `write.csv()`; called for its side effect of writing
#'   `file`.
write_all_gene_names <- function(
    mapping = gene_names,
    file = "~/Dropbox/genenet/data/gene_symbol_mapping.csv") {
  # quote = FALSE writes fields unquoted: a value containing a comma would
  # produce a malformed row, so this assumes ids/symbols are comma-free.
  write.csv(mapping, file = file, row.names = FALSE, quote = FALSE)
}