Querying SRA site for info

The following script contains code to query SRA for run accession numbers from experiment (SRX/ERX), sample (SRS/ERS), project (SRP/ERP), and BioProject (PRJNA) identifiers. The script can be downloaded here: getSraRunsFromAccIds.py

The script contains a class with static methods to get the run IDs associated with other identifiers, and a function to automatically generate the URL for a run ID. The script can be imported into another script to use the SRAUtils class and its functions, or it can be run on its own to get the IDs and URLs for several identifiers.

The script depends on the non-standard requests and xmltodict libraries, which might need to be installed

pip install requests
Requirement already satisfied (use --upgrade to upgrade): requests in /usr/lib/python2.7/dist-packages
Cleaning up...

getSraRunsFromAccIds.py

#!/usr/bin/env python
import shutil, os, argparse, sys, stat
import requests
import csv, io
import traceback
import xmltodict, json

class SRAUtils:
    '''
    Static helpers for looking up SRA run information from accession identifiers
    and for building the download URL of a run.

    Prefixes can be found here http://www.ddbj.nig.ac.jp/sub/prefix.html for DDBJ, ENA/EBI (ERA), and NCBI(SRA)

    Some interesting information here http://trace.ddbj.nig.ac.jp/dra/submission_e.html#Organization_of_metadata_objects
    '''
    # CGI endpoint queried by both search-term lookups.
    _SRA_CGI_URL = 'http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi'

    @staticmethod
    def _fetchSearchResultText(search, rettype):
        '''
        Query the SRA CGI endpoint for a search term and return the response text.

        search  -- the term sent as the "term" parameter
        rettype -- the value sent as the "rettype" parameter ("runinfo" or "full" here)

        Raises Exception on a non-200 status code or when the body is empty or
        contains only whitespace.
        '''
        #return types and databases listed here
        #https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
        payload = {"save": "efetch", "db": "sra", "rettype": rettype, "term": search}
        r = requests.get(SRAUtils._SRA_CGI_URL, params=payload)
        if 200 != r.status_code:
            raise Exception("Error in downloading from " + str(r.url) + " got response code " + str(r.status_code))
        # strip() also catches the completely empty body, which isspace() reports as False
        if not r.text.strip():
            raise Exception("Got blank string from " + str(r.url))
        return r.text

    @staticmethod
    def getInfoTableFromSearchTerm(search):
        '''
        Return the "runinfo" table for a search term as a list of dicts, one per row,
        keyed by the column names of the returned CSV.

        Raises Exception when the request fails, the response is blank, or the
        table has no rows.
        '''
        text = SRAUtils._fetchSearchResultText(search, "runinfo")
        infoRows = list(csv.DictReader(io.StringIO(text)))
        if not infoRows:
            raise Exception('Found %d entries in SRA for "%s" when expecting at least 1' % (len(infoRows), search))
        return infoRows

    @staticmethod
    def getDictTableFromSearchTerm(search):
        '''
        Return the "full" record for a search term, parsed from XML by xmltodict.parse.

        Raises Exception when the request fails or the response is blank.
        '''
        #@todo need to add way to select only what is needed
        return xmltodict.parse(SRAUtils._fetchSearchResultText(search, "full"))

    @staticmethod
    def getRunAccsFromInfoTable(infoTab):
        '''
        Return the 'Run' value of every row in infoTab, each converted with str().
        '''
        return [str(row.get('Run')) for row in infoTab]

    @staticmethod
    def getSraUrlFromRunAccession(accesion):
        '''
        Build the ftp-trace.ncbi.nlm.nih.gov URL of the .sra file for a run accession.

        accesion -- a str of at least 7 characters starting with ERR or SRR

        Raises Exception when the argument is not a str, is too short, or has
        another prefix.
        '''
        if not isinstance(accesion, str):
            raise Exception("Error in getSraUrlFromRunAccession: accesion should be str, not " + str(type(accesion)))
        if len(accesion) < 7:
            raise Exception("Error in getSraUrlFromRunAccession: accession should be least 7 character long, not " + str(len(accesion)) + ", for " + str(accesion))
        if not accesion.startswith(("ERR", "SRR")):
            raise Exception("Error in getSraUrlFromRunAccession: accession should start with either ERR or SRR, not " + str(accesion[0:3]))

        # the directory level below the prefix is named after the first six characters of the accession
        template = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/{PREFIX}/{PREFIX}{PREFIXNUMS}/{ACCESION}/{ACCESION}.sra"
        return template.format(PREFIX = accesion[0:3], PREFIXNUMS = accesion[3:6], ACCESION = accesion)

    @staticmethod
    def _getRowsMatchingColumn(accession, prefixes, column, prefixErrorStart):
        '''
        Shared implementation of the getInfoFrom*Acc lookups.

        accession        -- the identifier to search for
        prefixes         -- tuple of prefixes the identifier may start with
        column           -- run-info column whose value must equal the identifier
        prefixErrorStart -- start of the error message used for a bad prefix

        Returns the run-info rows whose column value equals the identifier.
        Raises Exception on a bad prefix or when no row matches.
        '''
        if not accession.startswith(prefixes):
            raise Exception(prefixErrorStart + accession)
        infoTab = SRAUtils.getInfoTableFromSearchTerm(accession)
        # the search can return rows for other accessions, so keep only exact matches
        runInfo = [row for row in infoTab if accession == str(row.get(column))]
        if not runInfo:
            raise Exception('Found %d entries in SRA for "%s" when expecting at least 1' % (len(runInfo), accession))
        return runInfo

    @staticmethod
    def getInfoFromRunAcc(run):
        '''
        Return the run-info rows whose 'Run' column equals the ERR/SRR accession.
        '''
        return SRAUtils._getRowsMatchingColumn(
            run, ("ERR", "SRR"), 'Run',
            "run should start with ERR or SRR, not: ")

    @staticmethod
    def getInfoFromSubmissionAcc(submission):
        '''
        Return the run-info rows whose 'Submission' column equals the ERA/SRA accession.
        '''
        return SRAUtils._getRowsMatchingColumn(
            submission, ("ERA", "SRA"), 'Submission',
            "submission should start with ERA or SRA, not: ")

    @staticmethod
    def getInfoFromSampleAcc(sample):
        '''
        Return the run-info rows whose 'Sample' column equals the ERS/SRS accession.
        '''
        return SRAUtils._getRowsMatchingColumn(
            sample, ("ERS", "SRS"), 'Sample',
            "sample should start with ERS or SRS, not: ")

    @staticmethod
    def getInfoFromProjectAcc(project):
        '''
        Return the run-info rows whose 'SRAStudy' column equals the ERP/SRP accession.
        '''
        return SRAUtils._getRowsMatchingColumn(
            project, ("ERP", "SRP"), 'SRAStudy',
            "project should start with ERP or SRP, not: ")

    @staticmethod
    def getInfoFromExperimentAcc(experiment):
        '''
        Return the run-info rows whose 'Experiment' column equals the ERX/SRX accession.
        '''
        return SRAUtils._getRowsMatchingColumn(
            experiment, ("ERX", "SRX"), 'Experiment',
            "experiment should start with ERX or SRX, not: ")

    @staticmethod
    def getInfoFromBioProjectAcc(bioProject):
        '''
        Return the run-info rows whose 'BioProject' column equals the
        PRJNA/PRJEA/PRJEB/PRJDA accession.
        '''
        return SRAUtils._getRowsMatchingColumn(
            bioProject, ("PRJNA", "PRJEA", "PRJEB", "PRJDA"), 'BioProject',
            "bioProject should start with PRJNA, PRJEA, PRJEB, and PRJDA, not: ")

    @staticmethod
    def getInfoFromBioSampleAcc(bioSample):
        '''
        Return the run-info rows whose 'BioSample' column equals the accession.

        SAMD     DDBJ
        SAME     ENA/EBI
        SAMN     NCBI
        '''
        return SRAUtils._getRowsMatchingColumn(
            bioSample, ("SAME", "SAMD", "SAMN"), 'BioSample',
            "bioSample should start with SAME, SAMD, and SAMN, not: ")

    @staticmethod
    def getInfoFromSRAIdentifier(identifier):
        '''
        Dispatch to the matching getInfoFrom*Acc lookup based on the identifier prefix.

        Raises Exception when the prefix is not one of the recognized ones.
        '''
        lookupsByPrefix = (
            (("ERX", "SRX"), SRAUtils.getInfoFromExperimentAcc),
            (("ERP", "SRP"), SRAUtils.getInfoFromProjectAcc),
            (("ERS", "SRS"), SRAUtils.getInfoFromSampleAcc),
            (("ERR", "SRR"), SRAUtils.getInfoFromRunAcc),
            (("ERA", "SRA"), SRAUtils.getInfoFromSubmissionAcc),
            #SAME, SAMD, and SAMN
            (("SAME", "SAMD", "SAMN"), SRAUtils.getInfoFromBioSampleAcc),
            #bioproject prefixes: PRJDA (DDBJ), PRJNA (short read archive), PRJEA or PRJEB (european archive)
            (("PRJDA", "PRJNA", "PRJEA", "PRJEB"), SRAUtils.getInfoFromBioProjectAcc),
        )
        for prefixes, lookup in lookupsByPrefix:
            if identifier.startswith(prefixes):
                return lookup(identifier)
        raise Exception("Error, unrecognized prefix for sra Identifier " + str(identifier))


def parse_args_sraIdentifier():
    '''
    Parse the command line of getSraRunsFromAccIds.py.

    Returns the argparse namespace with identifiers (required str),
    outStub (required str) and overWrite (flag, False unless given).
    '''
    optionSpecs = (
        ('--identifiers', dict(type=str, required=True, help="A list of comma separated SRA identifiers e.g. SRP046206,SRX188939,SRS807544,SRR1759594,PRJNA63661")),
        ('--outStub', dict(type=str, required=True, help="An output stub for info and sra urls output files")),
        ('--overWrite', dict(action="store_true", help="Overwrite files if they already exist")),
    )
    argParser = argparse.ArgumentParser()
    for flag, options in optionSpecs:
        argParser.add_argument(flag, **options)
    return argParser.parse_args()

def runGetRunsFromSampleAcc():
    '''
    Command-line entry point: look up every identifier given with --identifiers
    and write two tab-separated files named from --outStub.

    <outStub>_urls.tab.txt -- one line per run: identifier, run accession, .sra URL
    <outStub>_info.tab.txt -- the run-info rows, under a header taken from the
                              first identifier that was looked up successfully

    Raises Exception when an output file already exists and --overWrite was not
    given. A failure for a single identifier is printed with its traceback and
    the remaining identifiers are still processed.
    '''
    args = parse_args_sraIdentifier()
    identifiers = args.identifiers.split(",")
    outUrlsFnp = args.outStub + "_urls.tab.txt"
    outInfoFnp = args.outStub + "_info.tab.txt"
    # refuse to clobber either output before anything is opened for writing
    for outFnp in (outUrlsFnp, outInfoFnp):
        if os.path.exists(outFnp) and not args.overWrite:
            raise Exception("File " + outFnp + " already exists, use --overWrite to over write it")
    with open(outUrlsFnp, "w") as outUrlsFile:
        with open(outInfoFnp, "w") as outInfoFile:
            outUrlsFile.write(str("identifier") + "\t" + str("run") + "\t" + "url" + "\n")
            header = None
            for identifier in identifiers:
                try:
                    tab = SRAUtils.getInfoFromSRAIdentifier(identifier)
                    if header is None:
                        columns = list(tab[0].keys())
                        outInfoFile.write("\t".join(columns) + "\n")
                        # only remember the header once it has actually been written
                        header = columns
                    for row in tab:
                        run = str(row.get("Run"))
                        url = SRAUtils.getSraUrlFromRunAccession(run)
                        outUrlsFile.write(str(identifier) + "\t" + run + "\t" + url + "\n")
                        # write cells in header order so every row lines up with the header,
                        # and turn missing cells into empty strings so join cannot fail on None
                        cells = [row.get(column) for column in header]
                        outInfoFile.write("\t".join("" if cell is None else cell for cell in cells) + "\n")
                # "except X as err" is accepted by Python 2.6+ and Python 3
                except Exception as err:
                    print ("Failed  to get info for " + str(identifier) + ", mess: " + str(err))
                    traceback.print_exc()
                    

# Run the command-line tool only when this file is executed as a script,
# so importing it for the SRAUtils class has no side effects.
if __name__ == "__main__":
    runGetRunsFromSampleAcc()

Running the script

./getSraRunsFromAccIds.py --identifiers SRP046206,SRX188939,SRS807544,SRR1759594 --outStub out

This will create two files: one ends with _info.tab.txt and contains the whole table returned by querying the SRA website, and the other ends with _urls.tab.txt and contains the URLs of all run files for the given identifiers

cat out_info.tab.txt
g1k_analysis_group  download_path   dbgap_study_accession   TaxID   InsertSize  LoadDate    Platform    Experiment  ScientificName  Study_Pubmed_id Sample  Submission  Run LibraryStrategy LibrarySource   LibraryName SRAStudy    spots   ReleaseDate source  bases   LibrarySelection    CenterName  BioSample   Histological_Type   Subject_ID  SampleType  Disease Analyte_Type    Tumor   ReadHash    avgLength   LibraryLayout   BioProject  AssemblyName    Model   Affection_Status    Body_Site   g1k_pop_code    ProjectID   Sex size_MB spots_with_mates    SampleName  Consent RunHash InsertDev
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR1565149     5855    300 2017-02-16 01:50:38 ILLUMINA    SRX692314   Plasmodium vivax        SRS694268   SRA181560   SRR1565149  WGS GENOMIC Pond-301146 SRP046206   10276   2014-09-04 21:55:05     2075752 Hybrid Selection    BI  SAMN02677080            simple          no  D8A72DD43D91DC19659A54842FC69E73    202 PAIRED  PRJNA240383 GCF_000002415.2 Illumina MiSeq              240383      2   10276   Plasmodium vivax Brazil08   public  B8213350745A193B08FCDF0F18E03C9F    75.563
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR1568176     5855    300 2014-09-09 01:25:02 ILLUMINA    SRX695081   Plasmodium vivax        SRS694268   SRA181560   SRR1568176  WGS GENOMIC PVHS0102    SRP046206   6915622 2014-09-06 00:50:14     1396955644  Hybrid Selection    BI  SAMN02677080            simple          no  926940223B188509795D62D03B0D4BE8    202 PAIRED  PRJNA240383 assembly    Illumina HiSeq 2000             240383      743 6915622 Plasmodium vivax Brazil08   public  524860DD2F5DD0DA8AB8A66A8241B166    104.333
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR1568808     5855    310 2014-09-07 11:47:40 ILLUMINA    SRX695611   Plasmodium vivax        SRS694268   SRA181560   SRR1568808  WGS GENOMIC Pond-283082 SRP046206   131 2014-09-07 11:47:06     26462   Hybrid Selection    BI  SAMN02677080            simple          no  05962A1B8A19ED467F6ED1CA45194541    202 PAIRED  PRJNA240383     Illumina MiSeq              240383      0   131 Plasmodium vivax Brazil08   public  D43DF2D3092E7378355F87D955BDCEF1    0
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR575087      31273   200 2015-12-22 10:37:30 ILLUMINA    SRX188939   Plasmodium vivax strain Belem   2   SRS365050   SRA058771   SRR575087   WGS GENOMIC     SRP015757   81446663    2015-07-22 17:09:26     16289332600 RANDOM  CLEVELAND CLINIC FOUNDATION SRS365050           simple          no  7C262756885EE2E9FBD7B1984A3BFC5D    200 PAIRED  PRJNA175266     Illumina HiSeq 2000             175266      9481    81446663    Bel public  65B4906B84B7506BFF60BC70B07D93AC    0
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR1740345     5855    0   2015-01-05 23:41:45 ILLUMINA    SRX828494   Plasmodium vivax        SRS807544   SRA223859   SRR1740345  WGS GENOMIC C0924 Day 0 SRP051660   30615773    2015-08-10 00:00:00     6184386146  unspecified NEW YORK UNIVERSITY SAMN03275225            simple          no  EFCB3237203373DFA7F10B6686BDB8BC    202 PAIRED  PRJNA271480     Illumina HiSeq 2500             271480      3646    30615773    C0924 Day 0 public  5C5F0F326076B3C75992A20C74FC6242    0
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR1740587     5855    0   2015-01-06 00:11:15 ILLUMINA    SRX828757   Plasmodium vivax        SRS807544   SRA223859   SRR1740587  WGS GENOMIC C0924 Day 0 SRP051660   27013208    2015-08-10 00:00:00     2728334008  unspecified NEW YORK UNIVERSITY SAMN03275225            simple          no  F0FFC9DAABC6E88955CCE03C21EAE076    101 SINGLE  PRJNA271480     Illumina HiSeq 2500             271480      1596    0   C0924 Day 0 public  9A49803EEA8A6282EB80DB2613737730    0
    https://sra-download.ncbi.nlm.nih.gov/srapub/SRR1759594     5855    0   2015-01-14 19:29:08 ILLUMINA    SRX843590   Plasmodium vivax        SRS819741   SRA232158   SRR1759594  WGS GENOMIC PNG076  SRP046126   32904304    2015-08-10 00:00:00     6646669408  Hybrid Selection    NEW YORK UNIVERSITY SAMN03284630            simple          no  A3D5A4287388B1210A08043DB0BD2603    202 PAIRED  PRJNA240530     Illumina HiSeq 2000             240530      3881    32904304    Plasmodium_vivax_PNG076 public  ABCD36B55D6DA62D4AEFD5BD7ACD885E    0
cat out_urls.tab.txt
identifier  run url
SRP046206   SRR1565149  ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR156/SRR1565149/SRR1565149.sra
SRP046206   SRR1568176  ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR156/SRR1568176/SRR1568176.sra
SRP046206   SRR1568808  ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR156/SRR1568808/SRR1568808.sra
SRX188939   SRR575087   ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR575/SRR575087/SRR575087.sra
SRS807544   SRR1740345  ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR174/SRR1740345/SRR1740345.sra
SRS807544   SRR1740587  ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR174/SRR1740587/SRR1740587.sra
SRR1759594  SRR1759594  ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR175/SRR1759594/SRR1759594.sra

Downloading threaded

Also below is a script that can be used on the url table created by getSraRunsFromAccIds.py above.
This depends on the URLGrabber and joblib Python libraries

downloadSraFromTable.py

pip install URLGrabber
Requirement already satisfied (use --upgrade to upgrade): URLGrabber in /usr/local/lib/python2.7/dist-packages
Cleaning up...

downloadSraFromTable.py

#!/usr/bin/env python
import shutil, os, argparse, sys, stat, urllib

from urlgrabber.grabber import URLGrabber
import requests
import csv
from joblib import Parallel, delayed
import traceback

def get_file_if_size_diff(url, d):
    '''
    Download url into directory d, keeping the last path component of the url
    as the local file name, and return whatever URLGrabber.urlgrab returns.

    NOTE(review): reget = "simple" is presumably what lets a re-run continue a
    partially downloaded file -- confirm against the urlgrabber documentation.
    '''
    localName = url.rsplit('/', 1)[-1]
    destination = os.path.join(d, localName)
    grabber = URLGrabber(reget = "simple")
    return grabber.urlgrab(url, destination)
    
def downloadFileAttempt(url, directoryName):
    '''
    Try to download url into directoryName without letting a failure propagate.

    Returns the value of get_file_if_size_diff on success. On any Exception the
    failure and its traceback are printed and None is returned, so one bad URL
    does not abort the other downloads.
    '''
    try:
        return get_file_if_size_diff(url, directoryName)
    # "except X as err" is accepted by Python 2.6+ and Python 3
    except Exception as err:
        print ("Failed  to download " + str(url) + " to " + str(directoryName) + ", mess: " + str(err))
        traceback.print_exc()
        return None
        
def parse_args_downloadFiles():
    '''
    Parse the command line of downloadSraFromTable.py.

    Returns the argparse namespace with sraUrlFnp (required str),
    outDirectory (str, default "./") and ncpus (int, default 1).
    '''
    argParser = argparse.ArgumentParser()
    argParser.add_argument(
        '--sraUrlFnp',
        help = "SRA url file created by getSraRunsFromAccIds.py",
        required = True,
        type=str)
    argParser.add_argument(
        '--outDirectory',
        help = "The directory in which to download the files",
        default = "./",
        type=str)
    argParser.add_argument(
        '--ncpus',
        help = "Number of cpus to use",
        default = 1,
        type=int)
    parsed = argParser.parse_args()
    return parsed


# Script entry point: read the "url" column of the table given with --sraUrlFnp
# and download every URL into --outDirectory using --ncpus parallel jobs.
if __name__ == "__main__":
    args = parse_args_downloadFiles()
    # the csv module wants a binary file on Python 2 and a text file on Python 3
    urlFileMode = 'rb' if sys.version_info[0] < 3 else 'r'
    with open(args.sraUrlFnp, urlFileMode) as urlFile:
        reader =  csv.DictReader(urlFile, delimiter='\t')
        urls = [row.get("url") for row in reader]
    # the table is fully read at this point, so the file is closed before the downloads start;
    # download into the requested directory instead of the hard-coded "./"
    downloadedFiles = Parallel(n_jobs = args.ncpus)(delayed(downloadFileAttempt)(url, args.outDirectory) for url in urls )
    print(downloadedFiles)

Running the script

./downloadSraFromTable.py --sraUrlFnp out_urls.tab.txt --ncpus 6 --outDirectory ./
['./SRR1565149.sra', './SRR1568176.sra', './SRR1568808.sra', './SRR575087.sra', './SRR1740345.sra', './SRR1740587.sra', './SRR1759594.sra']

The script will often time out on requests, especially if you do a lot of downloads at once, but simply re-running the script will allow you to download all the files: it checks whether a file is already downloaded and is the right file, so the script can be re-run until all files have finished downloading.

./downloadSraFromTable.py --sraUrlFnp out_urls.tab.txt --ncpus 6 --outDirectory ./
['./SRR1565149.sra', './SRR1568176.sra', './SRR1568808.sra', './SRR575087.sra', './SRR1740345.sra', './SRR1740587.sra', './SRR1759594.sra']