library(XML)
library(rentrez)
library(ape)
library(phangorn)
## Warning: closing unused connection 5
## (ray2000_edited.fas.mapped.degapped.aa)
source("utils.R")
You also need to set your email address.
entrez_email <- "sdwfrost@gmail.com"
If we have a list of accessions, we can post these to the nucleotide database, known as nuccore
, and then get the sequences back in one of several formats.
You can find the list of search terms using entrez_db_searchable
.
search_fields <- entrez_db_searchable("nuccore")
search_fields
## Searchable fields for database 'nuccore'
## [1] ALL UID FILT WORD TITL KYWD AUTH JOUR VOL ISS PAGE ORGN ACCN PACC
## [15] GENE PROT ECNO PDAT MDAT SUBS PROP SQID GPRJ SLEN FKEY PORG COMP ASSM
## [29] DIV STRN ISOL CULT BRD BIOS
myaccs <- c("AF271819", "AF271820", "AF271821", "AF271823", "AF271824",
"AF271822", "AF271818", "AF271817", "AF271876", "AF271875", "AF271882",
"AF271879", "AF271878", "AF271877", "AF271887", "AF271881", "AF271880",
"AF271874", "AF271883", "AF271885", "AF271886", "AF271884", "AF271847",
"AF271851", "AF271866", "AF271839", "AF271841", "AF271849", "AF271873",
"AF271843", "AF271858", "AF271867", "AF271855", "AF271872", "AF271831",
"AF271825", "AF271863", "AF271835", "AF271845", "AF271859", "AF271860",
"AF271836", "AF271832", "AF271827", "AF271834", "AF271854", "AF271828",
"AF271850", "AF271853", "AF271848", "AF271857", "AF271829", "AF271865",
"AF271838", "AF271856", "AF271852", "AF271864", "AF271862", "AF271842",
"AF271830", "AF271871", "AF271869", "AF271833", "AF271844", "AF271870",
"AF271861", "AF271868", "AF271826", "AF271837", "AF271846", "AF271840"
)
myids <- accToIds(myaccs)
mypost <- entrez_post(db="nuccore",id=myids)
myseqs <- entrez_fetch(db="nuccore", ids="", rettype="fasta", WebEnv=mypost$WebEnv, query_key=mypost$QueryKey)
cat(myseqs,file="query.fas")
We search PubMed for the paper identifier.
ray2000 <- entrez_search(db="pubmed",term="ray[au] and hcv and egypt and 2000", retmax=10,usehistory=TRUE)
ray2000
## Entrez search result with 1 hits (object contains 1 IDs and a cookie)
We can now get the data in PubMed on this - very useful for literature reviews!
ray2000.pubmed <- entrez_fetch(db="pubmed",id=ray2000$ids,rettype="xml")
ray2000.link <- entrez_link(db="nuccore",dbfrom="pubmed",linkname="pubmed_nuccore",id=ray2000$ids)
ray2000.ids <- ray2000.link$pubmed_nuccore
ray2000.post <- entrez_post(db="nuccore",id=ray2000.ids)
## Error in function (type, msg, asError = TRUE) : Recv failure: Connection reset by peer
ray2000.seqs <- entrez_fetch(db="nuccore", ids="", rettype="fasta", WebEnv=ray2000.post$WebEnv, query_key=ray2000.post$QueryKey)
## Error in make_entrez_query("efetch", db = db, rettype = rettype, config = config, : object 'ray2000.post' not found
ray2000.xml <- entrez_fetch(db="nuccore", ids="", rettype="xml", WebEnv=ray2000.post$WebEnv, query_key=ray2000.post$QueryKey)
## Error in make_entrez_query("efetch", db = db, rettype = rettype, config = config, : object 'ray2000.post' not found
write(ray2000.seqs,"ray2000.fas")
## Error in cat(x, file = file, sep = c(rep.int(sep, ncolumns - 1), "\n"), : object 'ray2000.seqs' not found
To get all genomes manually we:
tax <- entrez_search(db="taxonomy",term="Hepatitis C", retmax=10,usehistory=TRUE)
tax
## Entrez search result with 1 hits (object contains 1 IDs and a cookie)
tax$ids
## [1] "11103"
genome <- entrez_search(db="genome",term=paste("txid",tax$ids,"[Orgn]",sep=""))
genome
## Entrez search result with 1 hits (object contains 1 IDs and no cookie)
There is a lot of information associated with the genome, that we can obtain using entrez_summary
.
genome.summary <- entrez_summary(db="genome",id=genome$ids)
genome.summary
## esummary result with 21 items:
## [1] uid organism_name organism_kingdom
## [4] organism_group organism_subgroup defline
## [7] projectid project_accession status
## [10] number_of_chromosomes number_of_plasmids number_of_organelles
## [13] assembly_name assembly_accession assemblyid
## [16] create_date options weight
## [19] chromosome_assemblies scaffold_assemblies sra_genomes
To cross reference databases - like clicking a link in the table - we use entrez_link
.
link <- entrez_link(db="nuccore",dbfrom="genome",linkname="genome_nuccore_samespecies",id=genome$ids)
link
## elink result with ids from 1 databases:
## [1] genome_nuccore_samespecies
seq.ids <- link$genome_nuccore_samespecies
length(seq.ids)
## [1] 1430
Now that we have the sequence identifiers we need, we can download the sequences. I'll just get 100 genomes. Firstly, we post the IDs to the database using entrez_post
.
seqs.post <- entrez_post(db="nuccore",id=seq.ids[1:100])
Next, we use entrez_fetch
to fetch the sequences.
seqs.fasta <- entrez_fetch(db="nuccore", ids="", rettype="fasta", WebEnv=seqs.post$WebEnv, query_key=seqs.post$QueryKey, retend=100)
## Error in function (type, msg, asError = TRUE) : Recv failure: Connection reset by peer
write(seqs.fasta,"hcv_genomes.fas")