API Reference

dryad2dataverse

Dryad to Dataverse utilities. No modules are loaded by default, so

>>> import dryad2dataverse

will work but has no effect on its own; import the submodules you need explicitly.

Modules included:

  • dryad2dataverse.constants : “Constants” for all modules. URLs, API keys, etc. are all here.

  • dryad2dataverse.serializer : Download and serialize Dryad JSON to Dataverse JSON.

  • dryad2dataverse.transfer : Metadata and file transfer utilities.

  • dryad2dataverse.monitor : Monitoring and database tools for maintaining a pipeline to Dataverse without unnecessary downloading and file duplication.

  • dryad2dataverse.exceptions : Custom exceptions.
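
A minimal end-to-end sketch, assuming dryad2dataverse.constants has been configured with a reachable Dataverse installation and a valid API key; 'my_dataverse' is a hypothetical collection alias:

>>> import dryad2dataverse.serializer
>>> import dryad2dataverse.transfer
>>> ser = dryad2dataverse.serializer.Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> txfer = dryad2dataverse.transfer.Transfer(ser)
>>> txfer.upload_study(targetDv='my_dataverse')  #creates a new Dataverse study from the Dryad metadata
>>> txfer.download_files()                       #files are saved to dryad2dataverse.constants.TMP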

dryad2dataverse.serializer

Serializes Dryad study JSON to Dataverse JSON, as well as producing associated file information.
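
For example, a metadata-only conversion might look like the following sketch (network access to the Dryad API is assumed; no Dataverse credentials are needed at this stage):

>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> meta = ser.dvJson  #fetches the Dryad record and assembles Dataverse JSON
>>> meta['datasetVersion']['license']['name']
'CC0 1.0'
>>> names = [f[1] for f in ser.files]  #file names reported by Dryad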

Serializer

Serializes Dryad JSON to Dataverse JSON

Source code in src/dryad2dataverse/serializer.py
class Serializer():
    '''
    Serializes Dryad JSON to Dataverse JSON
    '''
    CC0='''<p>
    <img src="https://licensebuttons.net/p/zero/1.0/88x31.png" title="Creative Commons CC0 1.0 Universal Public Domain Dedication. " style="display:none" onload="this.style.display='inline'" />
    <a href="http://creativecommons.org/publicdomain/zero/1.0" title="Creative Commons CC0 1.0 Universal Public Domain Dedication. " target="_blank">CC0 1.0</a>
    </p>'''

    def __init__(self, doi):
        '''
        Creates Dryad study metadata instance.

        Parameters
        ----------
        doi : str
            DOI of Dryad study. Required for downloading.
            eg: 'doi:10.5061/dryad.2rbnzs7jp'
        '''
        self.doi = doi
        self._dryadJson = None
        self._fileJson = None
        self._dvJson = None
        #Serializer objects will be assigned a Dataverse study PID
        #if dryad2Dataverse.transfer.Transfer() is instantiated
        self.dvpid = None
        self.session = requests.Session()
        self.session.mount('https://',
                           HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
        LOGGER.debug('Creating Serializer instance object')

    def fetch_record(self, url=None, timeout=45):
        '''
        Fetches Dryad study record JSON from Dryad V2 API at
        https://datadryad.org/api/v2/datasets/.
        Saves to self._dryadJson. Querying Serializer.dryadJson
        will call this function automatically.

        Parameters
        ----------
        url : str
            Dryad instance base URL (eg: 'https://datadryad.org').
        timeout : int
            Timeout in seconds. Default 45.
        '''
        if not url:
            url = constants.DRYURL
        try:
            headers = {'accept':'application/json',
                       'Content-Type':'application/json'}
            headers.update(USER_AGENT)
            doiClean = urllib.parse.quote(self.doi, safe='')
            resp = self.session.get(f'{url}/api/v2/datasets/{doiClean}',
                                    headers=headers, timeout=timeout)
            resp.raise_for_status()
            self._dryadJson = resp.json()
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as err:
            LOGGER.error('URL error for: %s', url)
            LOGGER.exception(err)
            raise

    @property
    def id(self):
        '''
        Returns Dryad unique *database* ID, not the DOI.

        Where the original Dryad JSON is dryadJson, it's the integer
        trailing portion of:

        `self.dryadJson['_links']['stash:version']['href']`
        '''
        href = self.dryadJson['_links']['stash:version']['href']
        index = href.rfind('/') + 1
        return int(href[index:])

    @property
    def dryadJson(self):
        '''
        Returns Dryad study JSON. Will call Serializer.fetch_record() if
        no JSON is present.
        '''
        if not self._dryadJson:
            self.fetch_record()
        return self._dryadJson

    @dryadJson.setter
    def dryadJson(self, value=None):
        '''
        Fetches Dryad JSON from Dryad website if not supplied.

        If supplying it, make sure it's correct or you will run into trouble
        with processing later.

        Parameters
        ----------
        value : dict
            Dryad JSON.
        '''
        if value:
            self._dryadJson = value
        else:
            self.fetch_record()

    @property
    def embargo(self)->bool:
        '''
        Check embargo status. Returns boolean True if embargoed.

        '''
        if self.dryadJson.get('curationStatus') == 'Embargoed':
            return True
        return False

    @property
    def dvJson(self):
        '''
        Returns Dataverse study JSON as dict.
        '''
        self._assemble_json()
        return self._dvJson

    @property
    def fileJson(self, timeout=45):
        '''
        Returns a list of file JSONs from a call to the Dryad API
        /versions/{id}/files, where the ID is parsed from the Dryad
        JSON. Dryad file listings are paginated, so the return consists
        of a list of dicts, one per page.

        Parameters
        ----------
        timeout : int
            Request timeout in seconds.
        '''
        if not self._fileJson:
            try:
                self._fileJson = []
                headers = {'accept':'application/json',
                           'Content-Type':'application/json'}
                headers.update(USER_AGENT)
                fileList = self.session.get(f'{constants.DRYURL}/api/v2/versions/{self.id}/files',
                                            headers=headers,
                                            timeout=timeout)
                fileList.raise_for_status()
                #total = fileList.json()['total'] #Not needed
                lastPage = fileList.json()['_links']['last']['href']
                pages = int(lastPage[lastPage.rfind('=')+1:])
                self._fileJson.append(fileList.json())
                for i in range(2, pages+1):
                    fileCont = self.session.get(f'{constants.DRYURL}/api/v2'
                                                f'/versions/{self.id}/files?page={i}',
                                                headers=headers,
                                                timeout=timeout)
                    fileCont.raise_for_status()
                    self._fileJson.append(fileCont.json())
            except Exception as e:
                LOGGER.exception(e)
                raise
        return self._fileJson

    @property
    def files(self)->list:
        '''
        Returns a list of tuples with:

        (Download_location, filename, mimetype, size, description,
         digestType, digest)

        Digest types include, but are not necessarily limited to:

        'adler-32','crc-32','md2','md5','sha-1','sha-256',
        'sha-384','sha-512'
        '''
        out = []
        for page in self.fileJson:
            files = page['_embedded'].get('stash:files')
            if files:
                for f in files:
                    #This broke with this commit:
                    # https://github.com/datadryad/dryad-app/commit/b8a333ba34b14e55cbc1d7ed5aa4451e0f41db66

                    #downLink = f['_links']['stash:file-download']['href']
                    downLink = f['_links']['stash:download']['href']
                    downLink = f'{constants.DRYURL}{downLink}'
                    name = f['path']
                    mimeType = f['mimeType']
                    size = f['size']
                    #HOW ABOUT PUTTING THIS IN THE DRYAD API PAGE?
                    descr = f.get('description', '')
                    digestType = f.get('digestType', '')
                    #not all files have a digest
                    digest = f.get('digest', '')
                    #Does it matter? If the primary use case is to
                    #compare why not take all the digest types.
                    #md5 = ''
                    #if digestType == 'md5' and digest:
                    #    md5 = digest
                    #    #nothing in the docs as to algorithms so just picking md5
                    #    #Email from Ryan Scherle 30 Nov 20: supported digest type
                    #    #('adler-32','crc-32','md2','md5','sha-1','sha-256',
                    #    #'sha-384','sha-512')
                    out.append((downLink, name, mimeType, size, descr, digestType,
                                digest))

        return out

    @property
    def oversize(self, maxsize=None):
        '''
        Returns a list of Dryad files whose size value
        exceeds maxsize. Maximum size defaults to
        dryad2dataverse.constants.MAX_UPLOAD

        Parameters
        ----------
        maxsize : int
            Size in bytes above which a file is flagged as oversize.
            Defaults to constants.MAX_UPLOAD.
        '''
        if not maxsize:
            maxsize = constants.MAX_UPLOAD
        toobig = []
        for f in self.files:
            if f[3] >= maxsize:
                toobig.append(f)
        return toobig

    #def_typeclass(self, typeName, multiple, typeClass):
    @staticmethod
    def _typeclass(typeName, multiple, typeClass):
        '''
        Creates wrapper around single or multiple Dataverse JSON objects.
        Returns a dict *without* the Dataverse 'value' key.

        Parameters
        ----------
        typeName : str
            Dataverse typeName (eg: 'author').

        multiple : boolean
            "Multiple" value in Dataverse JSON.

        typeClass : str
            Dataverse typeClass. Usually one of 'compound', 'primitive',
            or 'controlledVocabulary'.
        '''
        return {'typeName':typeName, 'multiple':multiple,
                'typeClass':typeClass}

    @staticmethod
    def _convert_generic(**kwargs):
        '''
        Generic dataverse json segment creator of form:
            ```
            {dvField:
                {'typeName': dvField,
                 'value': dryField}}
            ```
        Suitable for generalized conversions. Only provides fields with
        'multiple': False and 'typeClass': 'primitive'.

        Parameters
        ----------
        kwargs : dict
            Dict from Dataverse JSON segment

        Other parameters
        ----------------
        dvField : str
            Dataverse output field

        dryField : str
            Dryad JSON field to convert

        inJson : dict
            Dryad JSON **segment** to convert

        addJson : dict (optional)
            Any other JSON required to complete the segment (cf. ISNI).

        rType : str
            'dict' (default) or 'list'.
            Returns 'value' field as dict value or list.

        pNotes : str
            Notes to be prepended to list type values.
            No trailing space required.
        '''

        dvField = kwargs.get('dvField')
        dryField = kwargs.get('dryField')
        inJson = kwargs.get('inJson')
        addJson = kwargs.get('addJson')
        pNotes = kwargs.get('pNotes', '')
        rType = kwargs.get('rType', 'dict')
        if not dvField or not dryField or not inJson:
            try:
                raise ValueError('Incorrect or insufficient fields provided')
            except ValueError as e:
                LOGGER.exception(e)
                raise
        outfield = inJson.get(dryField)
        if outfield:
            outfield = outfield.strip()
        #if not outfield:
        #    raise ValueError(f'Dryad field {dryField} not found')
        # If value missing can still concat empty dict
        if not outfield:
            return {}
        if rType == 'list':
            if pNotes:
                outfield = [f'{pNotes} {outfield}']

        outJson = {dvField:{'typeName':dvField,
                            'multiple': False,
                            'typeClass':'primitive',
                            'value': outfield}}
        #Simple conversion
        if not addJson:
            return outJson

        #Add JSONs together
        addJson.update(outJson)
        return addJson

    @staticmethod
    def _convert_author_names(author):
        '''
        Produces required author json fields.
        This is a special case, requiring concatenation of several fields.

        Parameters
        ----------
        author : dict
            dryad['author'] JSON segment.
        '''
        first = author.get('firstName')
        last = author.get('lastName')
        if first is None and last is None:
            return None
        authname = f"{author.get('lastName','')}, {author.get('firstName', '')}"
        return {'authorName':
                {'typeName':'authorName', 'value': authname,
                 'multiple':False, 'typeClass':'primitive'}}

    @staticmethod
    def _convert_keywords(*args):
        '''
        Produces the insane keyword structure Dataverse JSON segment
        from a list of words.

        Parameters
        ----------
        args : list
            List with elements as strings.
            Generally input is Dryad JSON 'keywords', i.e. *Dryad['keywords'].
            Don't forget to expand the list using *.
        '''
        outlist = []
        for arg in args:
            outlist.append({'keywordValue': {
                'typeName':'keywordValue',
                'value': arg}})
        return outlist

    @staticmethod
    def _convert_notes(dryJson):
        '''
        Returns formatted notes field with Dryad JSON values that
        don't really fit anywhere into the Dataverse JSON.

        Parameters
        ----------
        dryJson : dict
            Dryad JSON as dict.
        '''
        notes = ''
        #these fields should be concatenated into notes
        notable = ['versionNumber',
                   'versionStatus',
                   'manuscriptNumber',
                   'curationStatus',
                   'preserveCurationStatus',
                   'invoiceId',
                   'sharingLink',
                   'loosenValidation',
                   'skipDataciteUpdate',
                   'storageSize',
                   'visibility',
                   'skipEmails']
        for note in notable:
            text = dryJson.get(note)
            if text:
                text = str(text).strip()
                if note == 'versionNumber':
                    text = f'<b>Dryad version number:</b> {text}'
                if note == 'versionStatus':
                    text = f'<b>Version status:</b> {text}'
                if note == 'manuscriptNumber':
                    text = f'<b>Manuscript number:</b> {text}'
                if note == 'curationStatus':
                    text = f'<b>Dryad curation status:</b> {text}'
                if note == 'preserveCurationStatus':
                    text = f'<b>Dryad preserve curation status:</b> {text}'
                if note == 'invoiceId':
                    text = f'<b>Invoice ID:</b> {text}'
                if note == 'sharingLink':
                    text = f'<b>Sharing link:</b> {text}'
                if note == 'loosenValidation':
                    text = f'<b>Loosen validation:</b> {text}'
                if note == 'skipDataciteUpdate':
                    text = f'<b>Skip Datacite update:</b> {text}'
                if note == 'storageSize':
                    text = f'<b>Storage size:</b> {text}'
                if note == 'visibility':
                    text = f'<b>Visibility:</b> {text}'
                if note == 'skipEmails':
                    text = f'<b>Skip emails:</b> {text}'

                notes += f'<p>{text}</p>\n'
        concat = {'typeName':'notesText',
                  'multiple':False,
                  'typeClass': 'primitive',
                  'value': notes}
        return concat

    @staticmethod
    def _boundingbox(north, south, east, west):
        '''
        Makes a Dataverse bounding box from appropriate coordinates.
        Returns Dataverse JSON segment as dict.

        Parameters
        ----------
        north : float
        south : float
        east : float
        west : float

        Notes
        -----
            Coordinates in decimal degrees.
        '''
        names = ['north', 'south', 'east', 'west']
        points = [str(x) for x in [north, south, east, west]]
        #Because coordinates in DV are strings BFY
        coords = [(x[0]+'Longitude', {x[0]:x[1]}) for x in zip(names, points)]
        #Yes, everything is longitude in Dataverse
        out = []
        for coord in coords:
            out.append(Serializer._convert_generic(inJson=coord[1],
                                                   dvField=coord[0],
                                                   #dryField='north'))
                                                   dryField=[k for k in coord[1].keys()][0]))
        return out

    @staticmethod
    def _convert_geospatial(dryJson):
        '''
        Outputs Dataverse geospatial metadata block.

        Parameters
        ----------
        dryJson : dict
            Dryad JSON as dict.
        '''
        if dryJson.get('locations'):
            #out = {}
            coverage = []
            box = []
            otherCov = None
            gbbox = None
            for loc in dryJson.get('locations'):
                if loc.get('place'):
                    #These are impossible to clean. Going to "other" field

                    other = Serializer._convert_generic(inJson=loc,
                                                        dvField='otherGeographicCoverage',
                                                        dryField='place')
                    coverage.append(other)


                if loc.get('point'):
                    #makes size zero bounding box
                    north = loc['point']['latitude']
                    south = north
                    east = loc['point']['longitude']
                    west = east
                    point = Serializer._boundingbox(north, south, east, west)
                    box.append(point)

                if loc.get('box'):
                    north = loc['box']['neLatitude']
                    south = loc['box']['swLatitude']
                    east = loc['box']['neLongitude']
                    west = loc['box']['swLongitude']
                    area = Serializer._boundingbox(north, south, east, west)
                    box.append(area)

            if coverage:
                otherCov = Serializer._typeclass(typeName='geographicCoverage',
                                                 multiple=True, typeClass='compound')
                otherCov['value'] = coverage

            if box:
                gbbox = Serializer._typeclass(typeName='geographicCoverage',
                                              multiple=True, typeClass='compound')
                gbbox['value'] = box

            if otherCov or gbbox:
                gblock = {'geospatial': {'displayName' : 'Geospatial Metadata',
                                         'fields': []}}
                if otherCov:
                    gblock['geospatial']['fields'].append(otherCov)
                if gbbox:
                    gblock['geospatial']['fields'].append(gbbox)
                #only return the block if something was actually converted;
                #otherwise fall through and return an empty dict
                return gblock
        return {}

    def _assemble_json(self, dryJson=None, dvContact=None,
                       dvEmail=None, defContact=True):
        '''
        Assembles Dataverse json from Dryad JSON components.
        Dataverse JSON is a nightmare, so this function is too.

        Parameters
        ----------
        dryJson : dict
            Dryad JSON as dict.

        dvContact : str
            Default Dataverse contact name.

        dvEmail : str
            Default Dataverse contact email address.

        defContact : boolean
            Flag to include default contact information with the record.
        '''
        if not dvContact:
            dvContact = constants.DV_CONTACT_NAME
        if not dvEmail:
            dvEmail = constants.DV_CONTACT_EMAIL
        if not dryJson:
            dryJson = self.dryadJson
        LOGGER.debug(dryJson)
        #Licence block changes ensure that it will only work with
        #Dataverse v5.10+
        #Go back to previous commits to see the earlier "standard"
        self._dvJson = {'datasetVersion':
                        {'license':{'name': 'CC0 1.0',
                                    'uri': 'http://creativecommons.org/publicdomain/zero/1.0' },
                         'termsOfUse': Serializer.CC0,
                         'metadataBlocks':{'citation':
                                           {'displayName': 'Citation Metadata',
                                            'fields': []},
                                           }
                         }
                        }
        #REQUIRED Dataverse fields

        #Dryad is a general purpose database; it is hard/impossible to get
        #Dataverse required subject tags out of their keywords, so:
        defaultSubj = {'typeName' : 'subject',
                       'typeClass':'controlledVocabulary',
                       'multiple': True,
                       'value' : ['Other']}
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(defaultSubj)

        reqdTitle = Serializer._convert_generic(inJson=dryJson,
                                                dryField='title',
                                                dvField='title')['title']

        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(reqdTitle)

        #authors
        out = []
        for a in dryJson['authors']:
            reqdAuthor = Serializer._convert_author_names(a)
            if reqdAuthor:
                affiliation = Serializer._convert_generic(inJson=a,
                                                          dvField='authorAffiliation',
                                                          dryField='affiliation')
                addOrc = {'authorIdentifierScheme':
                          {'typeName':'authorIdentifierScheme',
                           'value': 'ORCID',
                           'typeClass': 'controlledVocabulary',
                           'multiple':False}}
                #only ORCID at UBC
                orcid = Serializer._convert_generic(inJson=a,
                                                    dvField='authorIdentifier',
                                                    dryField='orcid',
                                                    addJson=addOrc)
                if affiliation:
                    reqdAuthor.update(affiliation)
                if orcid:
                    reqdAuthor.update(orcid)
                out.append(reqdAuthor)

        authors = Serializer._typeclass(typeName='author',
                                        multiple=True, typeClass='compound')
        authors['value'] = out

        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(authors)


        ##rewrite as function:contact
        out = []
        for e in dryJson['authors']:
            reqdContact = Serializer._convert_generic(inJson=e,
                                                      dvField='datasetContactEmail',
                                                      dryField='email')
            if reqdContact:
                author = Serializer._convert_author_names(e)
                author = {'author':author['authorName']['value']}
                #for passing to function
                author = Serializer._convert_generic(inJson=author,
                                                     dvField='datasetContactName',
                                                     dryField='author')
                if author:
                    reqdContact.update(author)
                affiliation = Serializer._convert_generic(inJson=e,
                                                          dvField='datasetContactAffiliation',
                                                          dryField='affiliation')
                if affiliation:
                    reqdContact.update(affiliation)
                out.append(reqdContact)

        if defContact:
            #Adds default contact information the tail of the list
            defEmail = Serializer._convert_generic(inJson={'em':dvEmail},
                                                   dvField='datasetContactEmail',
                                                   dryField='em')
            defName = Serializer._convert_generic(inJson={'name':dvContact},
                                                  dvField='datasetContactName',
                                                  dryField='name')
            defEmail.update(defName)
            out.append(defEmail)

        contacts = Serializer._typeclass(typeName='datasetContact',
                                         multiple=True, typeClass='compound')
        contacts['value'] = out
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(contacts)

        #Description
        description = Serializer._typeclass(typeName='dsDescription',
                                            multiple=True, typeClass='compound')
        desCat = [('abstract', '<b>Abstract</b><br/>'),
                  ('methods', '<b>Methods</b><br />'),
                  ('usageNotes', '<b>Usage notes</b><br />')]
        out = []
        for desc in desCat:
            if dryJson.get(desc[0]):
                descrField = Serializer._convert_generic(inJson=dryJson,
                                                         dvField='dsDescriptionValue',
                                                         dryField=desc[0])
                descrField['dsDescriptionValue']['value'] = (desc[1]
                                                             + descrField['dsDescriptionValue']['value'])

                descDate = Serializer._convert_generic(inJson=dryJson,
                                                       dvField='dsDescriptionDate',
                                                       dryField='lastModificationDate')
                descrField.update(descDate)
                out.append(descrField)

        description['value'] = out
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(description)

        #Granting agencies
        if dryJson.get('funders'):

            out = []
            for fund in dryJson['funders']:
                org = Serializer._convert_generic(inJson=fund,
                                                  dvField='grantNumberAgency',
                                                  dryField='organization')
                if fund.get('awardNumber'):
                    fund = Serializer._convert_generic(inJson=fund,
                                                       dvField='grantNumberValue',
                                                       dryField='awardNumber')
                    org.update(fund)
                out.append(org)
            grants = Serializer._typeclass(typeName='grantNumber',
                                           multiple=True, typeClass='compound')
            grants['value'] = out
            self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(grants)

        #Keywords
        keywords = Serializer._typeclass(typeName='keyword',
                                         multiple=True, typeClass='compound')
        out = []
        for key in dryJson.get('keywords', []):
            #Apparently keywords are not required
            keydict = {'keyword':key}
            #because takes a dict
            kv = Serializer._convert_generic(inJson=keydict,
                                             dvField='keywordValue',
                                             dryField='keyword')
            vocab = {'dryad':'Dryad'}
            voc = Serializer._convert_generic(inJson=vocab,
                                              dvField='keywordVocabulary',
                                              dryField='dryad')
            kv.update(voc)
            out.append(kv)
        keywords['value'] = out
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(keywords)

        #modification date
        moddate = Serializer._convert_generic(inJson=dryJson,
                                              dvField='dateOfDeposit',
                                              dryField='lastModificationDate')
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(moddate['dateOfDeposit'])
        #This one isn't nested BFY

        #distribution date
        distdate = Serializer._convert_generic(inJson=dryJson,
                                               dvField='distributionDate',
                                               dryField='publicationDate')
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(distdate['distributionDate'])
        #Also not nested

        #publications
        publications = Serializer._typeclass(typeName='publication',
                                             multiple=True,
                                             typeClass='compound')
        #quick and dirty lookup table
        #TODONE see https://github.com/CDL-Dryad/dryad-app/blob/
        #31d17d8dab7ea3bab1256063a1e4d0cb706dd5ec/stash/stash_datacite/
        #app/models/stash_datacite/related_identifier.rb
        #no longer required
        #lookup = {'IsDerivedFrom':'Is derived from',
        #          'Cites':'Cites',
        #          'IsSupplementTo': 'Is supplement to',
        #          'IsSupplementedBy': 'Is supplemented by'}
        out = []
        if dryJson.get('relatedWorks'):
            for r in dryJson.get('relatedWorks'):
                #id = r.get('identifier')
                #TODONE Verify that changing id to _id has not broken anything: 11Feb21
                _id = r.get('identifier')
                #Note:10 Feb 2021 : some records have identifier = ''. BAD DRYAD.
                if not _id:
                    continue
                relationship = r.get('relationship')
                #idType = r.get('identifierType') #not required in _convert_generic
                #citation = {'citation': f"{lookup[relationship]}: {id}"}
                citation = {'citation': relationship.capitalize()}
                pubcite = Serializer._convert_generic(inJson=citation,
                                                      dvField='publicationCitation',
                                                      dryField='citation')
                pubIdType = Serializer._convert_generic(inJson=r,
                                                        dvField='publicationIDType',
                                                        dryField='identifierType')
                #ID type must be lower case
                pubIdType['publicationIDType']['value'] = pubIdType['publicationIDType']['value'].lower()
                pubIdType['publicationIDType']['typeClass'] = 'controlledVocabulary'

                pubUrl = Serializer._convert_generic(inJson=r,
                                                     dvField='publicationURL',
                                                     dryField='identifier')

                #Dryad doesn't just put URLs in their URL field.
                if pubUrl['publicationURL']['value'].lower().startswith('doi:'):
                    fixurl = 'https://doi.org/' + pubUrl['publicationURL']['value'][4:]
                    pubUrl['publicationURL']['value'] = fixurl
                    LOGGER.debug('Rewrote URLs to be %s', fixurl)

                #Dryad doesn't validate URL fields to start with http or https. Assume https
                if not pubUrl['publicationURL']['value'].lower().startswith('htt'):
                    pubUrl['publicationURL']['value'] = ('https://' +
                                                         pubUrl['publicationURL']['value'])
                pubcite.update(pubIdType)
                pubcite.update(pubUrl)
                out.append(pubcite)
        publications['value'] = out
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(publications)
        #notes
        #go into primary notes field, not DDI
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(Serializer._convert_notes(dryJson))

        #Geospatial metadata
        self._dvJson['datasetVersion']['metadataBlocks'].update(Serializer._convert_geospatial(dryJson))

        #DOI --> agency/identifier
        doi = Serializer._convert_generic(inJson=dryJson, dryField='identifier',
                                          dvField='otherIdValue')
        doi.update(Serializer._convert_generic(inJson={'agency':'Dryad'},
                                               dryField='agency',
                                               dvField='otherIdAgency'))
        agency = Serializer._typeclass(typeName='otherId',
                                       multiple=True, typeClass='compound')
        agency['value'] = [doi]
        self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(agency)

dryadJson property writable

Returns Dryad study JSON. Will call Serializer.fetch_record() if no JSON is present.
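
A sketch of both access patterns; `already_fetched` below is a hypothetical dict holding a previously retrieved Dryad record:

>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> record = ser.dryadJson           #first access triggers fetch_record()
>>> ser.dryadJson = already_fetched  #or supply your own (correct!) Dryad JSON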

dvJson property

Returns Dataverse study JSON as dict.

embargo property

Check embargo status. Returns boolean True if embargoed.
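
For example, a pipeline might skip embargoed studies (sketch):

>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> if ser.embargo:
...     print(f'{ser.doi} is embargoed; skipping transfer')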

fileJson property

Returns a list of file JSONs from a call to the Dryad API /versions/{id}/files, where the ID is parsed from the Dryad JSON. Dryad file listings are paginated, so the return consists of a list of dicts, one per page.

Parameters:
  • timeout (int) –

    Request timeout in seconds.
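
For instance, a quick check of how many pages Dryad returned (sketch):

>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> pages = ser.fileJson
>>> len(pages)  #one dict per page of the Dryad file listing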

files property

Returns a list of tuples with:

(Download_location, filename, mimetype, size, description, digestType, digest)

Digest types include, but are not necessarily limited to:

'adler-32', 'crc-32', 'md2', 'md5', 'sha-1', 'sha-256', 'sha-384', 'sha-512'
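
A sketch of unpacking the tuples; the field order matches the code above:

>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> for url, name, mime, size, descr, digest_type, digest in ser.files:
...     print(name, size, digest_type or 'no digest')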

id property

Returns Dryad unique database ID, not the DOI.

Where the original Dryad JSON is dryadJson, it’s the integer trailing portion of:

self.dryadJson['_links']['stash:version']['href']

oversize property

Returns a list of Dryad files whose size value exceeds maxsize. Maximum size defaults to dryad2dataverse.constants.MAX_UPLOAD.

Parameters:
  • maxsize (int) –

    Size in bytes above which a file is flagged as oversize. Defaults to constants.MAX_UPLOAD.
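
For example, flagging files that exceed the upload limit (sketch):

>>> from dryad2dataverse import constants
>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> for f in ser.oversize:
...     print(f'{f[1]}: {f[3]} bytes exceeds {constants.MAX_UPLOAD}')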

__init__(doi)

Creates Dryad study metadata instance.

Parameters:
  • doi (str) –

    DOI of Dryad study. Required for downloading. eg: ‘doi:10.5061/dryad.2rbnzs7jp’

Source code in src/dryad2dataverse/serializer.py
def __init__(self, doi):
    '''
    Creates Dryad study metadata instance.

    Parameters
    ----------
    doi : str
        DOI of Dryad study. Required for downloading.
        eg: 'doi:10.5061/dryad.2rbnzs7jp'
    '''
    self.doi = doi
    self._dryadJson = None
    self._fileJson = None
    self._dvJson = None
    #Serializer objects will be assigned a Dataverse study PID
    #if dryad2Dataverse.transfer.Transfer() is instantiated
    self.dvpid = None
    self.session = requests.Session()
    self.session.mount('https://',
                       HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
    LOGGER.debug('Creating Serializer instance object')

fetch_record(url=None, timeout=45)

Fetches Dryad study record JSON from Dryad V2 API at https://datadryad.org/api/v2/datasets/. Saves to self._dryadJson. Querying Serializer.dryadJson will call this function automatically.

Parameters:
  • url (str, default: None ) –

    Dryad instance base URL (eg: ‘https://datadryad.org’).

  • timeout (int, default: 45 ) –

    Timeout in seconds. Default 45.
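
A sketch of fetching from a non-default Dryad instance; the URL below is hypothetical:

>>> from dryad2dataverse.serializer import Serializer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> ser.fetch_record(url='https://dryad.example.org', timeout=90)
>>> ser.dryadJson is not None
True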

Source code in src/dryad2dataverse/serializer.py
def fetch_record(self, url=None, timeout=45):
    '''
    Fetches Dryad study record JSON from Dryad V2 API at
    https://datadryad.org/api/v2/datasets/.
    Saves to self._dryadJson. Querying Serializer.dryadJson
    will call this function automatically.

    Parameters
    ----------
    url : str
        Dryad instance base URL (eg: 'https://datadryad.org').
    timeout : int
        Timeout in seconds. Default 45.
    '''
    if not url:
        url = constants.DRYURL
    try:
        headers = {'accept':'application/json',
                   'Content-Type':'application/json'}
        headers.update(USER_AGENT)
        doiClean = urllib.parse.quote(self.doi, safe='')
        resp = self.session.get(f'{url}/api/v2/datasets/{doiClean}',
                                headers=headers, timeout=timeout)
        resp.raise_for_status()
        self._dryadJson = resp.json()
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as err:
        LOGGER.error('URL error for: %s', url)
        LOGGER.exception(err)
        raise

dryad2dataverse.transfer

This module handles data downloads and uploads from a Dryad instance to a Dataverse instance.
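
A minimal transfer sketch, assuming dryad2dataverse.constants.DVURL and APIKEY are configured and that 'my_dataverse' is a hypothetical collection alias you control:

>>> from dryad2dataverse.serializer import Serializer
>>> from dryad2dataverse.transfer import Transfer
>>> ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> txfer = Transfer(ser)
>>> txfer.test_api_key()                         #raises DataverseBadApiKeyError if the key is invalid
>>> pid = txfer.upload_study(targetDv='my_dataverse')
>>> txfer.download_files()                       #saves files to constants.TMP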

Transfer

Transfers metadata and data files from a Dryad installation to a Dataverse installation.

Source code in src/dryad2dataverse/transfer.py
class Transfer():
    '''
    Transfers metadata and data files from a
    Dryad installation to a Dataverse installation.
    '''
    def __init__(self, dryad):
        '''
        Creates a dryad2dataverse.transfer.Transfer instance.

        Parameters
        ----------
        dryad : dryad2dataverse.serializer.Serializer
        '''
        self.dryad = dryad
        self._fileJson = None
        self._files = [list(f) for f in self.dryad.files]
        #self._files = copy.deepcopy(self.dryad.files)
        self.fileUpRecord = []
        self.fileDelRecord = []
        self.dvStudy = None
        self.jsonFlag = None #Whether or not new json uploaded
        self.session = requests.Session()
        self.session.mount('https://', HTTPAdapter(max_retries=constants.RETRY_STRATEGY))

    def _del__(self): #TODO: rename to __del__ to make this a true destructor
        '''Expunges files from constants.TMP on deletion'''
        for f in self.files:
            if os.path.exists(f'{constants.TMP}{os.sep}{f[1]}'):
                os.remove(f'{constants.TMP}{os.sep}{f[1]}')

    def test_api_key(self, url=None, apikey=None):
        '''
        Tests for an expired API key and raises
        dryad2dataverse.exceptions.DataverseBadApiKeyError
        if the API key is bad. Ignores other HTTP errors.

        Parameters
        ----------
        url : str
            Base URL to Dataverse installation.
            Defaults to dryad2dataverse.constants.DVURL
        apikey : str
            Default dryad2dataverse.constants.APIKEY.
        '''
        #API validity check appears to come before a PID validity check
        params = {'persistentId': 'doi:000/000/000'} # PID is irrelevant
        if not url:
            url = constants.DVURL
        headers = {'X-Dataverse-key': apikey if apikey else constants.APIKEY}
        headers.update(USER_AGENT)
        bad_test = self.session.get(f'{url}/api/datasets/:persistentId',
                                headers=headers,
                                params=params)
        #There's an extra space in the message which Harvard
        #will probably find out about, so . . .
        if bad_test.json().get('message').startswith('Bad api key'):
            try:
                raise exceptions.DataverseBadApiKeyError('Bad API key')
            except exceptions.DataverseBadApiKeyError as e:
                LOGGER.critical('API key has expired or is otherwise invalid')
                LOGGER.exception(e)
                #LOGGER.exception(traceback.format_exc()) #not really necessary
                raise
        try: #other errors
            bad_test.raise_for_status()
        except requests.exceptions.HTTPError:
            pass
        except Exception as e:
            LOGGER.exception(e)
            LOGGER.exception(traceback.format_exc())
            raise

    @property
    def dvpid(self):
        '''
        Returns Dataverse study persistent ID as str.
        '''
        return self.dryad.dvpid

    @property
    def auth(self):
        '''
        Returns Dataverse authentication header dict.
        ie: `{X-Dataverse-key' : 'APIKEYSTRING'}`
        '''
        return {'X-Dataverse-key' : constants.APIKEY}

    @property
    def fileJson(self):
        '''
        Returns a list of file JSONs from a call to the Dryad API
        /versions/{id}/files, where the ID is parsed from the Dryad JSON.
        Dryad file listings are paginated.
        '''
        return self.dryad.fileJson.copy()

    @property
    def files(self):
        '''
        Returns a list of lists with:

        [Download_location, filename, mimetype, size, description, digestType, digest]

        This is mutable; downloading a file will fill in the digest if it is not available.
        '''
        return self._files

    @property
    def oversize(self):
        '''
        Returns list of files exceeding Dataverse ingest limit
        dryad2dataverse.constants.MAX_UPLOAD.
        '''
        return self.dryad.oversize

    @property
    def doi(self):
        '''
        Returns Dryad DOI.
        '''
        return self.dryad.doi

    @staticmethod
    def _dryad_file_id(url:str):
        '''
        Returns Dryad fileID from dryad file download URL as integer.

        Parameters
        ----------
        url : str
            Dryad file URL in format
            'https://datadryad.org/api/v2/files/385820/download'.
        '''
        fid = url.strip('/download')
        fid = int(fid[fid.rfind('/')+1:])
        return fid

    @staticmethod
    def _make_dv_head(apikey):
        '''
        Returns Dataverse authentication header as dict.

        Parameters
        ----------
        apikey : str
            Dataverse API key.
        '''
        return {'X-Dataverse-key' : apikey}

    #@staticmethod
    def set_correct_date(self, url=None, hdl=None,
                         d_type='distributionDate',
                         apikey=None):
        '''
        Sets "correct" publication date for Dataverse.

        Parameters
        ----------
        url : str
            Base URL to Dataverse installation.
            Defaults to dryad2dataverse.constants.DVURL
        hdl : str
            Persistent identifier for the Dataverse study.
            Defaults to Transfer.dvpid (which can be None if the
            study has not yet been uploaded).
        d_type : str
            Date type. One of 'distributionDate', 'productionDate',
            'dateOfDeposit'. Default 'distributionDate'.
        apikey : str
            Default dryad2dataverse.constants.APIKEY.

        Notes
        -----
        dryad2dataverse.serializer maps Dryad 'publicationDate'
        to Dataverse 'distributionDate' (see serializer.py ~line 675).

        Dataverse citation date default is ":publicationDate". See
        Dataverse API reference:
        <https://guides.dataverse.org/en/4.20/api/native-api.html#id54>.

        '''
        try:
            if not url:
                url = constants.DVURL
            if not hdl:
                hdl = self.dvpid
            if apikey:
                headers = {'X-Dataverse-key' : apikey}
            else:
                headers = {'X-Dataverse-key' : constants.APIKEY}

            headers.update(USER_AGENT)
            params = {'persistentId': hdl}
            set_date = self.session.put(f'{url}/api/datasets/:persistentId/citationdate',
                                        headers=headers,
                                        data=d_type,
                                        params=params,
                                        timeout=45)
            set_date.raise_for_status()

        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as err:
            LOGGER.warning('Unable to set citation date for %s',
                           hdl)
            LOGGER.warning(err)
            LOGGER.warning(set_date.text)

    def upload_study(self, url=None, apikey=None, timeout=45, **kwargs):
        '''
        Uploads Dryad study metadata to target Dataverse or updates existing.
        Supplying a `targetDv` kwarg creates a new study and supplying a
        `dvpid` kwarg updates a currently existing Dataverse study.

        Parameters
        ----------
        url : str
            URL of Dataverse instance. Defaults to constants.DVURL.
        apikey : str
            API key of user. Defaults to constants.APIKEY.
        timeout : int
            timeout on POST request.
        kwargs : dict

        Other parameters
        ----------------
        targetDv : str
            Short name of the target dataverse. Required for a new dataset.
            Specify as targetDv=value.
        dvpid : str
            Dataverse persistent ID (for updating metadata).
            This is not required for new uploads; specify as dvpid=value.

        Notes
        -----
        One of targetDv or dvpid is required.
        '''
        if not url:
            url = constants.DVURL
        if not apikey:
            apikey = constants.APIKEY
        headers = {'X-Dataverse-key' : apikey}
        headers.update(USER_AGENT)
        targetDv = kwargs.get('targetDv')
        dvpid = kwargs.get('dvpid')
        #dryFid = kwargs.get('dryFid') #Why did I put this here?
        if not targetDv and not dvpid:
            try:
                raise exceptions.NoTargetError('You must supply one of targetDv \
                                    (target dataverse) \
                                     or dvpid (Dataverse persistent ID)')
            except exceptions.NoTargetError as e:
                LOGGER.error('No target dataverse or dvpid supplied')
                LOGGER.exception(e)
                raise

        if targetDv and dvpid:
            try:
                raise ValueError('Supply only one of targetDv or dvpid')
            except ValueError as e:
                LOGGER.exception(e)
                raise
        if not dvpid:
            endpoint = f'{url}/api/dataverses/{targetDv}/datasets'
            upload = self.session.post(endpoint,
                                       headers=headers,
                                       json=self.dryad.dvJson,
                                       timeout=timeout)
            LOGGER.debug(upload.text)
        else:
            endpoint = f'{url}/api/datasets/:persistentId/versions/:draft'
            params = {'persistentId':dvpid}
            #Yes, dataverse uses *different* json for edits
            upload = self.session.put(endpoint, params=params,
                                      headers=headers,
                                      json=self.dryad.dvJson['datasetVersion'],
                                      timeout=timeout)
            #self._dvrecord = upload.json()
            LOGGER.debug(upload.text)

        try:
            updata = upload.json()
            self.dvStudy = updata
            if updata.get('status') != 'OK':
                try:
                    raise exceptions.DataverseUploadError(('Status return is not OK.'
                                                           f'{upload.status_code}: '
                                                           f'{upload.reason}. '
                                                           f'{upload.request.url} '
                                                           f'{upload.text}'))
                except exceptions.DataverseUploadError as e:
                    LOGGER.exception(e)
                    LOGGER.exception(traceback.format_exc())
                    raise exceptions.DataverseUploadError(('Status return is not OK.'
                                                           f'{upload.status_code}: '
                                                           f'{upload.reason}. '
                                                           f'{upload.request.url} '
                                                           f'{upload.text}'))
            upload.raise_for_status()
        except Exception as e: # Only accessible via non-requests exception
            LOGGER.exception(e)
            LOGGER.exception(traceback.format_exc())
            raise

        if targetDv:
            self.dryad.dvpid = updata['data'].get('persistentId')
        if dvpid:
            self.dryad.dvpid = updata['data'].get('datasetPersistentId')
        return self.dvpid

    @staticmethod
    def _check_md5(infile, dig_type):
        '''
        Returns the hex digest of a file (formerly just md5sum).

        Parameters
        ----------
        infile : str
            Complete path to target file.
        dig_type : Union[str, None]
            Digest type
        '''
        #From Ryan Scherle
        #When Dryad calculates a digest, it only uses MD5.
        #But if you have precomputed some other type of digest, we should accept it.
        #The list of allowed values is:
        #('adler-32','crc-32','md2','md5','sha-1','sha-256','sha-384','sha-512')
        #hashlib doesn't support adler-32, crc-32, md2

        blocksize = 2**16
        #Well, this is inelegant
        with open(infile, 'rb') as m:
            #fmd5 = hashlib.md5()
            ## var name kept for posterity. Maybe refactor
            if dig_type in ['sha-1', 'sha-256', 'sha-384', 'sha-512', 'md5', 'md2']:
                if dig_type == 'md2':
                    fmd5 = Crypto.Hash.MD2.new()
                else:
                    fmd5 = HASHTABLE[dig_type]()
                fblock = m.read(blocksize)
                while fblock:
                    fmd5.update(fblock)
                    fblock = m.read(blocksize)
                return fmd5.hexdigest()
            if dig_type in ['adler-32', 'crc-32']:
                fblock = m.read(blocksize)
                curvalue = HASHTABLE[dig_type](fblock)
                while fblock:
                    fblock = m.read(blocksize)
                    curvalue = HASHTABLE[dig_type](fblock, curvalue)
                return curvalue
        raise exceptions.HashError(f'Unable to determine hash type for {infile}: {dig_type}')


    def download_file(self, url=None, filename=None, tmp=None,
                      size=None, chk=None, timeout=45, **kwargs):
        '''
        Downloads a file via requests streaming and saves to constants.TMP.
        Returns the checksum on success and raises an exception on failure.

        Parameters
        ----------
        url : str
            URL of download.
        filename : str
            Output file name.
        tmp : str
            Temporary directory for downloads.
            Defaults to dryad2dataverse.constants.TMP.
        size : int
            Reported file size in bytes.
            Defaults to dryad2dataverse.constants.MAX_UPLOAD.
        chk : str
            Checksum of file (if available and known).
        timeout : int
            Request timeout in seconds.
        kwargs : dict

        Other parameters
        ----------------
        digest_type : str
            checksum type (ie, md5, sha-256, etc)
        '''
        LOGGER.debug('Start download sequence')
        LOGGER.debug('MAX SIZE = %s', constants.MAX_UPLOAD)
        LOGGER.debug('Filename: %s, size=%s', filename, size)
        if not tmp:
            tmp = constants.TMP
        if tmp.endswith(os.sep):
            tmp = tmp[:-1]

        if size:
            if size > constants.MAX_UPLOAD:
                #TOO BIG
                LOGGER.warning('%s: File %s exceeds '
                               'Dataverse MAX_UPLOAD size. Skipping download.',
                               self.doi, filename)
                md5 = 'this_file_is_too_big_to_upload__' #HA HA
                for i in self._files:
                    if url == i[0]:
                        i[-1] = md5
                LOGGER.debug('Stop download sequence with large file skip')
                return md5
        try:
            down = self.session.get(url, timeout=timeout, stream=True)
            down.raise_for_status()
            with open(f'{tmp}{os.sep}{filename}', 'wb') as fi:
                for chunk in down.iter_content(chunk_size=8192):
                    fi.write(chunk)

            #verify size
            #https://stackoverflow.com/questions/2104080/how-can-i-check-file-size-in-python'
            if size:
                checkSize = os.stat(f'{tmp}{os.sep}{filename}').st_size
                if checkSize != size:
                    try:
                        raise exceptions.DownloadSizeError('Download size does not match '
                                                           'reported size')
                    except exceptions.DownloadSizeError as e:
                        LOGGER.exception(e)
                        raise
            #now check the md5
            md5 = None
            if chk and kwargs.get('digest_type') in HASHTABLE:
                md5 = Transfer._check_md5(f'{tmp}{os.sep}{filename}',
                                      kwargs['digest_type'])
                if md5 != chk:
                    try:
                        raise exceptions.HashError(f'Hex digest mismatch: {md5} : {chk}')
                        #is this really what I want to do on a bad checksum?
                    except exceptions.HashError as e:
                        LOGGER.exception(e)
                        raise
            for i in self._files:
                if url == i[0]:
                    i[-1] = md5
            LOGGER.debug('Complete download sequence')
            #This doesn't actually return an md5, just the hash value
            return md5
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as err:
            LOGGER.critical('Unable to download %s', url)
            LOGGER.exception(err)
            raise exceptions.DataverseDownloadError

    def download_files(self, files=None):
        '''
        Bulk downloader for files.

        Parameters
        ----------
        files : list
            Items in list can be tuples or list with a minimum of:
            `(dryaddownloadurl, filenamewithoutpath, [md5sum])`
            The md5 sum should be the last member of the tuple.
            Defaults to self.files.

        Notes
        -----
        Normally used without arguments to download all the associated
        files with a Dryad study.
        '''
        if not files:
            files = self.files
        try:
            for f in files:
                self.download_file(url=f[0],
                                   filename=f[1],
                                   mimetype=f[2],
                                   size=f[3],
                                   descr=f[4],
                                   digest_type=f[5],
                                   chk=f[-1])
        except exceptions.DataverseDownloadError as e:
            LOGGER.exception('Unable to download file with info %s\n%s', f, e)
            raise

    def file_lock_check(self, study, dv_url, apikey=None, count=0):
        '''
        Checks for a study lock.

        Returns True if locked. Normally used to check
        if processing is completed. As tabular processing
        halts file ingest, there should be no locks on a
        Dataverse study before performing a data file upload.

        Parameters
        ----------
        study : str
            Persistent identifier of study.
        dv_url : str
            URL to base Dataverse installation.
        apikey : str
            API key for user.
            If not present authorization defaults to self.auth.
        count : int
            Number of times the function has been called. Logs
            lock messages only on 0.
        '''
        if dv_url.endswith('/'):
            dv_url = dv_url[:-1]
        if apikey:
            headers = {'X-Dataverse-key': apikey}
        else:
            headers = self.auth

        headers.update(USER_AGENT)
        params = {'persistentId': study}
        try:
            lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
                                           headers=headers,
                                           params=params, timeout=300)
            lock_status.raise_for_status()
            if lock_status.json().get('data'):
                if count == 0:
                    LOGGER.warning('Study %s has been locked', study)
                    LOGGER.warning('Lock info:\n%s', lock_status.json())
                return True
            return False
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as err:
            LOGGER.error('Unable to detect lock status for %s', study)
            LOGGER.error('ERROR message: %s', lock_status.text)
            LOGGER.exception(err)
            #return True #Should I raise here?
            raise

    def force_notab_unlock(self, study, dv_url, apikey=None):
        '''
        Checks for a study lock and forcibly unlocks and uningests
        to prevent tabular file processing. Required if mime and filename
        spoofing is not sufficient.

        **Forcible unlocks require a superuser API key.**

        Parameters
        ----------
        study : str
            Persistent identifier of study.
        dv_url : str
            URL to base Dataverse installation.
        apikey : str
            API key for user.
            If not present authorization defaults to self.auth.
        '''
        if dv_url.endswith('/'):
            dv_url = dv_url[:-1]
        if apikey:
            headers = {'X-Dataverse-key': apikey}
        else:
            headers = self.auth

        headers.update(USER_AGENT)
        params = {'persistentId': study}
        lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
                                       headers=headers,
                                       params=params, timeout=300)
        lock_status.raise_for_status()
        if lock_status.json()['data']:
            LOGGER.warning('Study %s has been locked', study)
            LOGGER.warning('Lock info:\n%s', lock_status.json())
            force_unlock = self.session.delete(f'{dv_url}/api/datasets/:persistentId/locks',
                                               params=params, headers=headers,
                                               timeout=300)
            force_unlock.raise_for_status()
            LOGGER.warning('Lock removed for %s', study)
            LOGGER.warning('Lock status:\n %s', force_unlock.json())
            #This is what the file ID was for, in case it can
            #be implemented again.
            #According to Harvard, you can't remove the progress bar
            #for uploaded tab files that squeak through unless you
            #let them ingest first then reingest them. Oh well.
            #See:
            #https://groups.google.com/d/msgid/dataverse-community/
            #74caa708-e39b-4259-874d-5b6b74ef9723n%40googlegroups.com
            #Also, you can't uningest it because it hasn't been
            #ingested once it's been unlocked. So the commented
            #code below is useless (for now)
            #uningest = requests.post(f'{dv_url}/api/files/{fid}/uningest',
            #                         headers=headers,
            #                         timeout=300)
            #LOGGER.warning('Ingest halted for file %s for study %s', fid, study)
            #uningest.raise_for_status()

    def upload_file(self, dryadUrl=None, filename=None,
                    mimetype=None, size=None, descr=None,
                    hashtype=None,
                    #md5=None, studyId=None, dest=None,
                    digest=None, studyId=None, dest=None,
                    fprefix=None, force_unlock=False, timeout=300):
        '''
        Uploads file to Dataverse study. Returns a tuple of the
        dryadFid (or None) and Dataverse JSON from the POST request.
        Failures produce JSON with different status messages
        rather than raising an exception.

        Parameters
        ----------
        filename : str
            Filename (not including path).
        mimetype : str
            Mimetype of file.
        size : int
            Size in bytes.
        studyId : str
            Persistent Dataverse study identifier.
            Defaults to Transfer.dvpid.
        dest : str
            Destination dataverse installation url.
            Defaults to constants.DVURL.
        hashtype: str
            original Dryad hash type
        fprefix : str
            Path to file, not including a trailing slash.
        timeout : int
            Timeout in seconds for POST request. Default 300.
        dryadUrl : str
            Dryad download URL if you want to include a Dryad file id.
        force_unlock : bool
            Attempt forcible unlock instead of waiting for tabular
            file processing.
            Defaults to False.
            The Dataverse `/locks` endpoint blocks POST and DELETE requests
            from non-superusers (undocumented as of 31 March 2021).
            **Forcible unlock requires a superuser API key.**
        '''
        #return locals()
        #TODONE remove above
        if not studyId:
            studyId = self.dvpid
        if not dest:
            dest = constants.DVURL
        if not fprefix:
            fprefix = constants.TMP
        if dryadUrl:
            fid = dryadUrl.strip('/download')
            fid = int(fid[fid.rfind('/')+1:])
        else:
            fid = 0 #dummy fid for non-Dryad use
        params = {'persistentId' : studyId}
        upfile = fprefix + os.sep + filename[:]
        badExt = filename[filename.rfind('.'):].lower()
        #Descriptions are technically possible, although how to add
        #them is buried in Dryad's API documentation
        dv4meta = {'label' : filename[:], 'description' : descr}
        #if mimetype == 'application/zip' or filename.lower().endswith('.zip'):
        if mimetype == 'application/zip' or badExt in constants.NOTAB:
            mimetype = 'application/octet-stream' # stop unzipping automatically
            filename += '.NOPROCESS' # Also screw with their naming convention
            #debug log about file names to see what is up with XSLX
            #see doi:10.5061/dryad.z8w9ghxb6
            LOGGER.debug('File renamed to %s for upload', filename)
        if size >= constants.MAX_UPLOAD:
            fail = (fid, {'status' : 'Failure: MAX_UPLOAD size exceeded'})
            self.fileUpRecord.append(fail)
            LOGGER.warning('%s: File %s of '
                           'size %s exceeds '
                           'Dataverse MAX_UPLOAD size. Skipping.', self.doi, filename, size)
            return fail

        fields = {'file': (filename, open(upfile, 'rb'), mimetype)}
        fields.update({'jsonData': f'{dv4meta}'})
        multi = MultipartEncoder(fields=fields)
        ctype = {'Content-type' : multi.content_type}
        tmphead = self.auth.copy()
        tmphead.update(ctype)
        tmphead.update(USER_AGENT)
        url = dest + '/api/datasets/:persistentId/add'
        try:
            upload = self.session.post(url, params=params,
                                       headers=tmphead,
                                       data=multi, timeout=timeout)
            #print(upload.text)
            upload.raise_for_status()
            self.fileUpRecord.append((fid, upload.json()))
            upmd5 = upload.json()['data']['files'][0]['dataFile']['checksum']['value']
            #Dataverse hash type
            _type = upload.json()['data']['files'][0]['dataFile']['checksum']['type']
            if _type.lower() != hashtype.lower():
                comparator = self._check_md5(upfile, _type.lower())
            else:
                comparator = digest
            #if hashtype.lower () != 'md5':
            #    #get an md5 because dataverse uses md5s. Or most of them do anyway.
            #    #One day this will be rewritten properly.
            #    md5 = self._check_md5(filename, 'md5')
            #else:
            #    md5 = digest
            #if md5 and (upmd5 != md5):
            if upmd5 != comparator:
                try:
                    raise exceptions.HashError(f'{_type} mismatch:\nlocal: {comparator}\nuploaded: {upmd5}')
                except exceptions.HashError as e:
                    LOGGER.exception(e)
                    raise

            #Make damn sure that the study isn't locked because of
            #tab file processing
            ##SPSS files still process despite spoofing MIME and extension
            ##so there's also a forcible unlock check

            #fid = upload.json()['data']['files'][0]['dataFile']['id']
            #fid not required for unlock
            #self.force_notab_unlock(studyId, dest, fid)
            if force_unlock:
                self.force_notab_unlock(studyId, dest)
            else:
                count = 0
                wait = True
                while wait:
                    wait = self.file_lock_check(studyId, dest, count=count)
                    if wait:
                        time.sleep(15) # Don't hit it too often
                    count += 1


            return (fid, upload.json())

        except Exception as e:
            LOGGER.exception(e)
            try:
                reason = upload.json()['message']
                LOGGER.warning(upload.json())
                return (fid, {'status' : f'Failure: {reason}'})
            except Exception as e:
                LOGGER.warning('Further exceptions!')
                LOGGER.exception(e)
                LOGGER.warning(upload.text)
                return (fid, {'status' : f'Failure: Reason {upload.reason}'})

    def upload_files(self, files=None, pid=None, fprefix=None, force_unlock=False):
        '''
        Uploads multiple files to study with persistentId pid.
        Returns a list of the original tuples plus JSON responses.

        Parameters
        ----------
        files : list
            List contains tuples with
            (dryadDownloadURL, filename, mimetype, size).
        pid : str
            Defaults to self.dvpid, which is generated by calling
            dryad2dataverse.transfer.Transfer.upload_study().
        fprefix : str
            File location prefix.
            Defaults to dryad2dataverse.constants.TMP
        force_unlock : bool
            Attempt forcible unlock instead of waiting for tabular
            file processing.
            Defaults to False.
            The Dataverse `/locks` endpoint blocks POST and DELETE requests
            from non-superusers (undocumented as of 31 March 2021).
            **Forcible unlock requires a superuser API key.**
        '''
        if not files:
            files = self.files
        if not fprefix:
            fprefix = constants.TMP
        out = []
        for f in files:
            #out.append(self.upload_file(f[0], f[1], f[2], f[3],
            #                             f[4], f[5], pid, fprefix=fprefix))
            #out.append(self.upload_file(*[x for x in f],
            #last item in files is not necessary
            out.append(self.upload_file(*list(f)[:-1],
                                        studyId=pid, fprefix=fprefix,
                                        force_unlock=force_unlock))
        return out

    def upload_json(self, studyId=None, dest=None):
        '''
        Uploads Dryad json as a separate file for archival purposes.

        Parameters
        ----------
        studyId : str
            Dataverse persistent identifier.
            Default dryad2dataverse.transfer.Transfer.dvpid,
            which is only generated on
            dryad2dataverse.transfer.Transfer.upload_study()
        dest : str
            Base URL for transfer.
            Default dryad2dataverse.constants.DVURL
        '''
        if not studyId:
            studyId = self.dvpid
        if not dest:
            dest = constants.DVURL
        if not self.jsonFlag:
            url = dest + '/api/datasets/:persistentId/add'
            pack = io.StringIO(json.dumps(self.dryad.dryadJson))
            desc = {'description':'Original JSON from Dryad',
                    'categories':['Documentation', 'Code']}
            fname = self.doi[self.doi.rfind('/')+1:].replace('.', '_')
            payload = {'file': (f'{fname}.json', pack, 'text/plain;charset=UTF-8'),
                       'jsonData':f'{desc}'}
            params = {'persistentId':studyId}
            try:
                meta = self.session.post(f'{url}',
                                         params=params,
                                         headers=self.auth,
                                         files=payload)
                #0 because no dryad fid will be zero
                meta.raise_for_status()
                self.fileUpRecord.append((0, meta.json()))
                self.jsonFlag = (0, meta.json())
                LOGGER.debug('Successfully uploaded Dryad JSON to %s', studyId)

            #JSON uploads randomly fail with a Dataverse server.log error of
            #"A system exception occurred during an invocation on EJB . . ."
            #Not reproducible, so errors will only be written to the log.
            #Jesus.
            except (requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError) as err:
                LOGGER.error('Unable to upload Dryad JSON to %s', studyId)
                LOGGER.error('ERROR message: %s', meta.text)
                LOGGER.exception(err)
                #And further checking as to what is happening
                self.fileUpRecord.append((0, {'status':'Failure: Unable to upload Dryad JSON'}))
                if not isinstance(self.dryad.dryadJson, dict):
                    LOGGER.error('Dryad JSON is not a dictionary')
            except Exception as err:
                LOGGER.error('Unable to upload Dryad JSON')
                LOGGER.exception(err)

    def delete_dv_file(self, dvfid, dvurl=None, key=None)->bool:
        #WTAF curl -u $API_TOKEN: -X DELETE
        #https://$HOSTNAME/dvn/api/data-deposit/v1.1/swordv2/edit-media/file/123

        '''
        Deletes files from Dataverse target given a dataverse file ID.
        This information is unknowable unless discovered by
        dryad2dataverse.monitor.Monitor or by other methods.

        Returns 1 on success (204 response), or 0 on other response.

        Parameters
        ----------
        dvurl : str
            Base URL of dataverse instance.
            Defaults to dryad2dataverse.constants.DVURL.
        dvfid : str
            Dataverse file ID number.
        key : str
            API key
        '''
        if not dvurl:
            dvurl = constants.DVURL
        if not key:
            key = constants.APIKEY

        delme = self.session.delete(f'{dvurl}/dvn/api/data-deposit/v1.1/swordv2/edit-media'
                                    f'/file/{dvfid}',
                                    auth=(key, ''))
        if delme.status_code == 204:
            self.fileDelRecord.append(dvfid)
            return 1
        return 0

    def delete_dv_files(self, dvfids=None, dvurl=None, key=None):
        '''
        Deletes all files in list of Dataverse file ids from
        a Dataverse installation.

        Parameters
        ----------
        dvfids : list
            List of Dataverse file ids.
            Defaults to dryad2dataverse.transfer.Transfer.fileDelRecord.
        dvurl : str
            Base URL of Dataverse. Defaults to dryad2dataverse.constants.DVURL.
        key : str
            API key for Dataverse. Defaults to dryad2dataverse.constants.APIKEY.
        '''
        #if not dvfids:
        #   dvfids = self.fileDelRecord
        if not dvurl:
            dvurl = constants.DVURL
        if not key:
            key = constants.APIKEY
        for fid in dvfids:
            self.delete_dv_file(fid, dvurl, key)

auth property

Returns Dataverse authentication header dict, i.e. {'X-Dataverse-key' : 'APIKEYSTRING'}

doi property

Returns Dryad DOI.

dvpid property

Returns Dataverse study persistent ID as str.

fileJson property

Returns a list of file JSONs from a call to the Dryad API /files/{id}, where the ID is parsed from the Dryad JSON. Dryad file listings are paginated.

files property

Returns a list of lists with:

[Download_location, filename, mimetype, size, description, md5digest]

This is mutable; downloading a file will add md5 info if not available.

oversize property

Returns a list of files exceeding the Dataverse ingest limit dryad2dataverse.constants.MAX_UPLOAD.

__init__(dryad)

Creates a dryad2dataverse.transfer.Transfer instance.

Parameters:
  • dryad (dryad2dataverse.serializer.Serializer) –

    A dryad2dataverse.serializer.Serializer instance.

Source code in src/dryad2dataverse/transfer.py
def __init__(self, dryad):
    '''
    Creates a dryad2dataverse.transfer.Transfer instance.

    Parameters
    ----------
    dryad : dryad2dataverse.serializer.Serializer
    '''
    self.dryad = dryad
    self._fileJson = None
    self._files = [list(f) for f in self.dryad.files]
    #self._files = copy.deepcopy(self.dryad.files)
    self.fileUpRecord = []
    self.fileDelRecord = []
    self.dvStudy = None
    self.jsonFlag = None #Whether or not new json uploaded
    self.session = requests.Session()
    self.session.mount('https://', HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
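
A minimal construction sketch, assuming dryad2dataverse.constants has already been configured with a Dataverse URL and API key; the DOI below is illustrative:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')  # example Dryad DOI
trans = Transfer(ser)  # copies ser.files into its own mutable list for tracking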

delete_dv_file(dvfid, dvurl=None, key=None)

Deletes files from Dataverse target given a dataverse file ID. This information is unknowable unless discovered by dryad2dataverse.monitor.Monitor or by other methods.

Returns 1 on success (204 response), or 0 on other response.

Parameters:
  • dvurl (str, default: None ) –

    Base URL of dataverse instance. Defaults to dryad2dataverse.constants.DVURL.

  • dvfid (str) –

    Dataverse file ID number.

  • key (str, default: None ) –

    API key

Source code in src/dryad2dataverse/transfer.py
def delete_dv_file(self, dvfid, dvurl=None, key=None)->bool:
    #WTAF curl -u $API_TOKEN: -X DELETE
    #https://$HOSTNAME/dvn/api/data-deposit/v1.1/swordv2/edit-media/file/123

    '''
    Deletes files from Dataverse target given a dataverse file ID.
    This information is unknowable unless discovered by
    dryad2dataverse.monitor.Monitor or by other methods.

    Returns 1 on success (204 response), or 0 on other response.

    Parameters
    ----------
    dvurl : str
        Base URL of dataverse instance.
        Defaults to dryad2dataverse.constants.DVURL.
    dvfid : str
        Dataverse file ID number.
    key : str
        API key
    '''
    if not dvurl:
        dvurl = constants.DVURL
    if not key:
        key = constants.APIKEY

    delme = self.session.delete(f'{dvurl}/dvn/api/data-deposit/v1.1/swordv2/edit-media'
                                f'/file/{dvfid}',
                                auth=(key, ''))
    if delme.status_code == 204:
        self.fileDelRecord.append(dvfid)
        return 1
    return 0

delete_dv_files(dvfids=None, dvurl=None, key=None)

Deletes all files in list of Dataverse file ids from a Dataverse installation.

Parameters:
  • dvfids (list, default: None ) –

    List of Dataverse file ids. Defaults to dryad2dataverse.transfer.Transfer.fileDelRecord.

  • dvurl (str, default: None ) –

    Base URL of Dataverse. Defaults to dryad2dataverse.constants.DVURL.

  • key (str, default: None ) –

    API key for Dataverse. Defaults to dryad2dataverse.constants.APIKEY.

Source code in src/dryad2dataverse/transfer.py
def delete_dv_files(self, dvfids=None, dvurl=None, key=None):
    '''
    Deletes all files in list of Dataverse file ids from
    a Dataverse installation.

    Parameters
    ----------
    dvfids : list
        List of Dataverse file ids.
        Defaults to dryad2dataverse.transfer.Transfer.fileDelRecord.
    dvurl : str
        Base URL of Dataverse. Defaults to dryad2dataverse.constants.DVURL.
    key : str
        API key for Dataverse. Defaults to dryad2dataverse.constants.APIKEY.
    '''
    #if not dvfids:
    #   dvfids = self.fileDelRecord
    if not dvurl:
        dvurl = constants.DVURL
    if not key:
        key = constants.APIKEY
    for fid in dvfids:
        self.delete_dv_file(fid, dvurl, key)
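
A hedged sketch of bulk deletion; the Dataverse file IDs are placeholders, and note that dvfids must be passed explicitly because the fileDelRecord fallback is commented out in the source above:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
stale_ids = ['12345', '12346']  # placeholder Dataverse file IDs
trans.delete_dv_files(dvfids=stale_ids)  # dvurl and key default to constants.DVURL and constants.APIKEY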

download_file(url=None, filename=None, tmp=None, size=None, chk=None, timeout=45, **kwargs)

Downloads a file via requests streaming and saves to constants.TMP. Returns the checksum on success and raises an exception on failure.

Parameters:
  • url (str, default: None ) –

    URL of download.

  • filename (str, default: None ) –

    Output file name.

  • timeout (int, default: 45 ) –

    Requests timeout in seconds.

  • tmp (str, default: None ) –

    Temporary directory for downloads. Defaults to dryad2dataverse.constants.TMP.

  • size (int, default: None ) –

    Reported file size in bytes. Defaults to dryad2dataverse.constants.MAX_UPLOAD.

  • chk (str, default: None ) –

    checksum of file (if available and known).

  • kwargs (dict, default: {} ) –
  • digest_type (str) –

    checksum type (ie, md5, sha-256, etc)

Source code in src/dryad2dataverse/transfer.py
def download_file(self, url=None, filename=None, tmp=None,
                  size=None, chk=None, timeout=45, **kwargs):
    '''
    Downloads a file via requests streaming and saves to constants.TMP.
    Returns the checksum on success and raises an exception on failure.

    Parameters
    ----------
    url : str
        URL of download.
    filename : str
        Output file name.
    timeout : int
        Requests timeout in seconds. Default 45.
    tmp : str
        Temporary directory for downloads.
        Defaults to dryad2dataverse.constants.TMP.
    size : int
        Reported file size in bytes.
        Defaults to dryad2dataverse.constants.MAX_UPLOAD.
    chk : str
        checksum of file (if available and known).
    kwargs : dict

    Other parameters
    ----------------
    digest_type : str
        checksum type (ie, md5, sha-256, etc)
    '''
    LOGGER.debug('Start download sequence')
    LOGGER.debug('MAX SIZE = %s', constants.MAX_UPLOAD)
    LOGGER.debug('Filename: %s, size=%s', filename, size)
    if not tmp:
        tmp = constants.TMP
    if tmp.endswith(os.sep):
        tmp = tmp[:-1]

    if size:
        if size > constants.MAX_UPLOAD:
            #TOO BIG
            LOGGER.warning('%s: File %s exceeds '
                           'Dataverse MAX_UPLOAD size. Skipping download.',
                           self.doi, filename)
            md5 = 'this_file_is_too_big_to_upload__' #HA HA
            for i in self._files:
                if url == i[0]:
                    i[-1] = md5
            LOGGER.debug('Stop download sequence with large file skip')
            return md5
    try:
        down = self.session.get(url, timeout=timeout, stream=True)
        down.raise_for_status()
        with open(f'{tmp}{os.sep}{filename}', 'wb') as fi:
            for chunk in down.iter_content(chunk_size=8192):
                fi.write(chunk)

        #verify size
        #https://stackoverflow.com/questions/2104080/how-can-i-check-file-size-in-python'
        if size:
            checkSize = os.stat(f'{tmp}{os.sep}{filename}').st_size
            if checkSize != size:
                try:
                    raise exceptions.DownloadSizeError('Download size does not match '
                                                       'reported size')
                except exceptions.DownloadSizeError as e:
                    LOGGER.exception(e)
                    raise
        #now check the md5
        md5 = None
        if chk and kwargs.get('digest_type') in HASHTABLE:
            md5 = Transfer._check_md5(f'{tmp}{os.sep}{filename}',
                                  kwargs['digest_type'])
            if md5 != chk:
                try:
                    raise exceptions.HashError(f'Hex digest mismatch: {md5} : {chk}')
                    #is this really what I want to do on a bad checksum?
                except exceptions.HashError as e:
                    LOGGER.exception(e)
                    raise
        for i in self._files:
            if url == i[0]:
                i[-1] = md5
        LOGGER.debug('Complete download sequence')
        #This doesn't actually return an md5, just the hash value
        return md5
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as err:
        LOGGER.critical('Unable to download %s', url)
        LOGGER.exception(err)
        raise exceptions.DataverseDownloadError
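
A sketch of a single-file download, assuming the Transfer was built from a Serializer so that trans.files supplies the URL, name, size, and checksum; the 'md5' digest type is an assumption about what Dryad reported for this particular file:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
first = trans.files[0]  # [url, filename, mimetype, size, ...]
local_hash = trans.download_file(url=first[0], filename=first[1],
                                 size=first[3], chk=first[-1],
                                 digest_type='md5')  # digest type assumed here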

download_files(files=None)

Bulk downloader for files.

Parameters:
  • files (list, default: None ) –

    Items in list can be tuples or list with a minimum of: (dryaddownloadurl, filenamewithoutpath, [md5sum]) The md5 sum should be the last member of the tuple. Defaults to self.files.

Notes

Normally used without arguments to download all the associated files with a Dryad study.

Source code in src/dryad2dataverse/transfer.py
def download_files(self, files=None):
    '''
    Bulk downloader for files.

    Parameters
    ----------
    files : list
        Items in list can be tuples or list with a minimum of:
        `(dryaddownloadurl, filenamewithoutpath, [md5sum])`
        The md5 sum should be the last member of the tuple.
        Defaults to self.files.

    Notes
    -----
    Normally used without arguments to download all the associated
    files with a Dryad study.
    '''
    if not files:
        files = self.files
    try:
        for f in files:
            self.download_file(url=f[0],
                               filename=f[1],
                               mimetype=f[2],
                               size=f[3],
                               descr=f[4],
                               digest_type=f[5],
                               chk=f[-1])
    except exceptions.DataverseDownloadError as e:
        LOGGER.exception('Unable to download file with info %s\n%s', f, e)
        raise
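
In the common case the whole Dryad file set is fetched in one call; a sketch, assuming constants.TMP points at a writable directory:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
trans.download_files()  # downloads everything in trans.files to constants.TMP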

file_lock_check(study, dv_url, apikey=None, count=0)

Checks for a study lock

Returns True if locked. Normally used to check if processing is completed. As tabular processing halts file ingest, there should be no locks on a Dataverse study before performing a data file upload.

Parameters:
  • study (str) –

    Persistent identifier of study.

  • dv_url (str) –

    URL to base Dataverse installation.

  • apikey (str, default: None ) –

    API key for user. If not present authorization defaults to self.auth.

  • count (int, default: 0 ) –

    Number of times the function has been called. Logs lock messages only on 0.

Source code in src/dryad2dataverse/transfer.py
def file_lock_check(self, study, dv_url, apikey=None, count=0):
    '''
    Checks for a study lock

    Returns True if locked. Normally used to check
    if processing is completed. As tabular processing
    halts file ingest, there should be no locks on a
    Dataverse study before performing a data file upload.

    Parameters
    ----------
    study : str
        Persistent identifier of study.
    dv_url : str
        URL to base Dataverse installation.
    apikey : str
        API key for user.
        If not present authorization defaults to self.auth.
    count : int
        Number of times the function has been called. Logs
        lock messages only on 0.
    '''
    if dv_url.endswith('/'):
        dv_url = dv_url[:-1]
    if apikey:
        headers = {'X-Dataverse-key': apikey}
    else:
        headers = self.auth

    headers.update(USER_AGENT)
    params = {'persistentId': study}
    try:
        lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
                                       headers=headers,
                                       params=params, timeout=300)
        lock_status.raise_for_status()
        if lock_status.json().get('data'):
            if count == 0:
                LOGGER.warning('Study %s has been locked', study)
                LOGGER.warning('Lock info:\n%s', lock_status.json())
            return True
        return False
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as err:
        LOGGER.error('Unable to detect lock status for %s', study)
        LOGGER.error('ERROR message: %s', lock_status.text)
        LOGGER.exception(err)
        #return True #Should I raise here?
        raise
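
A polling sketch along the lines of what upload_file() does internally; the study PID and Dataverse URL below are placeholders:

import time
from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
count = 0
while trans.file_lock_check('doi:10.1234/FK2/EXAMPLE',        # placeholder study PID
                            'https://dataverse.example.edu',  # placeholder installation
                            count=count):
    time.sleep(15)  # wait for tabular ingest locks to clear before the next upload
    count += 1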

force_notab_unlock(study, dv_url, apikey=None)

Checks for a study lock and forcibly unlocks and uningests to prevent tabular file processing. Required if mime and filename spoofing is not sufficient.

Forcible unlocks require a superuser API key.

Parameters:
  • study (str) –

    Persistent identifier of study.

  • dv_url (str) –

    URL to base Dataverse installation.

  • apikey (str, default: None ) –

    API key for user. If not present authorization defaults to self.auth.

Source code in src/dryad2dataverse/transfer.py
def force_notab_unlock(self, study, dv_url, apikey=None):
    '''
    Checks for a study lock and forcibly unlocks and uningests
    to prevent tabular file processing. Required if mime and filename
    spoofing is not sufficient.

    **Forcible unlocks require a superuser API key.**

    Parameters
    ----------
    study : str
        Persistent identifier of study.
    dv_url : str
        URL to base Dataverse installation.
    apikey : str
        API key for user.
        If not present authorization defaults to self.auth.
    '''
    if dv_url.endswith('/'):
        dv_url = dv_url[:-1]
    if apikey:
        headers = {'X-Dataverse-key': apikey}
    else:
        headers = self.auth

    headers.update(USER_AGENT)
    params = {'persistentId': study}
    lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
                                   headers=headers,
                                   params=params, timeout=300)
    lock_status.raise_for_status()
    if lock_status.json()['data']:
        LOGGER.warning('Study %s has been locked', study)
        LOGGER.warning('Lock info:\n%s', lock_status.json())
        force_unlock = self.session.delete(f'{dv_url}/api/datasets/:persistentId/locks',
                                           params=params, headers=headers,
                                           timeout=300)
        force_unlock.raise_for_status()
        LOGGER.warning('Lock removed for %s', study)
        LOGGER.warning('Lock status:\n %s', force_unlock.json())

set_correct_date(url=None, hdl=None, d_type='distributionDate', apikey=None)

Sets “correct” publication date for Dataverse.

Parameters:
  • url (str, default: None ) –

    Base URL to Dataverse installation. Defaults to dryad2dataverse.constants.DVURL

  • hdl (str, default: None ) –

    Persistent identifier for Dataverse study. Defaults to Transfer.dvpid (which can be None if the study has not yet been uploaded).

  • d_type (str, default: 'distributionDate' ) –

    Date type. One of ‘distributionDate’, ‘productionDate’, ‘dateOfDeposit’. Default ‘distributionDate’.

  • apikey (str, default: None ) –

    Default dryad2dataverse.constants.APIKEY.

Notes

dryad2dataverse.serializer maps Dryad ‘publicationDate’ to Dataverse ‘distributionDate’ (see serializer.py ~line 675).

Dataverse citation date default is “:publicationDate”. See Dataverse API reference: https://guides.dataverse.org/en/4.20/api/native-api.html#id54.

Source code in src/dryad2dataverse/transfer.py
def set_correct_date(self, url=None, hdl=None,
                     d_type='distributionDate',
                     apikey=None):
    '''
    Sets "correct" publication date for Dataverse.

    Parameters
    ----------
    url : str
        Base URL to Dataverse installation.
        Defaults to dryad2dataverse.constants.DVURL
    hdl : str
        Persistent identifier for Dataverse study.
        Defaults to Transfer.dvpid (which can be None if the
        study has not yet been uploaded).
    d_type : str
        Date type. One of 'distributionDate', 'productionDate',
        'dateOfDeposit'. Default 'distributionDate'.
    apikey : str
        Default dryad2dataverse.constants.APIKEY.

    Notes
    -----
    dryad2dataverse.serializer maps Dryad 'publicationDate'
    to Dataverse 'distributionDate' (see serializer.py ~line 675).

    Dataverse citation date default is ":publicationDate". See
    Dataverse API reference:
    <https://guides.dataverse.org/en/4.20/api/native-api.html#id54>.

    '''
    try:
        if not url:
            url = constants.DVURL
        if not hdl:
            hdl = self.dvpid
        headers = {'X-Dataverse-key' : apikey}
        if apikey:
            headers = {'X-Dataverse-key' : apikey}
        else:
            headers = {'X-Dataverse-key' : constants.APIKEY}

        headers.update(USER_AGENT)
        params = {'persistentId': hdl}
        set_date = self.session.put(f'{url}/api/datasets/:persistentId/citationdate',
                                    headers=headers,
                                    data=d_type,
                                    params=params,
                                    timeout=45)
        set_date.raise_for_status()

    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as err:
        LOGGER.warning('Unable to set citation date for %s',
                       hdl)
        LOGGER.warning(err)
        LOGGER.warning(set_date.text)
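
A sketch of resetting the citation date after a study upload; it assumes upload_study() has already populated Transfer.dvpid and that constants.APIKEY is set:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
# ... trans.upload_study(targetDv='somedataverse') would normally come first ...
trans.set_correct_date()  # uses the default d_type of 'distributionDate'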

test_api_key(url=None, apikey=None)

Tests for an expired API key and raises dryad2dataverse.exceptions.DataverseBadApiKeyError if the API key is bad. Ignores other HTTP errors.

Parameters:
  • url (str, default: None ) –

    Base URL to Dataverse installation. Defaults to dryad2dataverse.constants.DVURL

  • apikey (str, default: None ) –

    Default dryad2dataverse.constants.APIKEY.

Source code in src/dryad2dataverse/transfer.py
def test_api_key(self, url=None, apikey=None):
    '''
    Tests for an expired API key and raises
    dryad2dataverse.exceptions.DataverseBadApiKeyError if
    the API key is bad. Ignores other HTTP errors.

    Parameters
    ----------
    url : str
        Base URL to Dataverse installation.
        Defaults to dryad2dataverse.constants.DVURL
    apikey : str
        Default dryad2dataverse.constants.APIKEY.
    '''
    #API validity check appears to come before a PID validity check
    params = {'persistentId': 'doi:000/000/000'} # PID is irrelevant
    if not url:
        url = constants.DVURL
    headers = {'X-Dataverse-key': apikey if apikey else constants.APIKEY}
    headers.update(USER_AGENT)
    bad_test = self.session.get(f'{url}/api/datasets/:persistentId',
                            headers=headers,
                            params=params)
    #There's an extra space in the message which Harvard
    #will probably find out about, so . . .
    if bad_test.json().get('message').startswith('Bad api key'):
        try:
            raise exceptions.DataverseBadApiKeyError('Bad API key')
        except exceptions.DataverseBadApiKeyError as e:
            LOGGER.critical('API key has expired or is otherwise invalid')
            LOGGER.exception(e)
            #LOGGER.exception(traceback.format_exc()) #not really necessary
            raise
    try: #other errors
        bad_test.raise_for_status()
    except requests.exceptions.HTTPError:
        pass
    except Exception as e:
        LOGGER.exception(e)
        LOGGER.exception(traceback.format_exc())
        raise
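
A sketch of a pre-flight key check; catching the exception lets a pipeline stop early rather than failing mid-transfer:

from dryad2dataverse import exceptions
from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
try:
    trans.test_api_key()  # defaults to constants.DVURL and constants.APIKEY
except exceptions.DataverseBadApiKeyError:
    raise SystemExit('Dataverse API key is expired or invalid; halting transfer')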

upload_file(dryadUrl=None, filename=None, mimetype=None, size=None, descr=None, hashtype=None, digest=None, studyId=None, dest=None, fprefix=None, force_unlock=False, timeout=300)

Uploads file to Dataverse study. Returns a tuple of the dryadFid (or None) and Dataverse JSON from the POST request. Failures produce JSON with different status messages rather than raising an exception.

Parameters:
  • filename (str, default: None ) –

    Filename (not including path).

  • mimetype (str, default: None ) –

    Mimetype of file.

  • size (int, default: None ) –

    Size in bytes.

  • studyId (str, default: None ) –

    Persistent Dataverse study identifier. Defaults to Transfer.dvpid.

  • dest (str, default: None ) –

    Destination dataverse installation url. Defaults to constants.DVURL.

  • hashtype (str, default: None ) –

    Original Dryad hash type.

  • fprefix (str, default: None ) –

    Path to file, not including a trailing slash.

  • timeout (int, default: 300 ) –

    Timeout in seconds for POST request. Default 300.

  • dryadUrl (str, default: None ) –

    Dryad download URL if you want to include a Dryad file id.

  • force_unlock (bool, default: False ) –

    Attempt forcible unlock instead of waiting for tabular file processing. Defaults to False. The Dataverse /locks endpoint blocks POST and DELETE requests from non-superusers (undocumented as of 31 March 2021). Forcible unlock requires a superuser API key.

Source code in src/dryad2dataverse/transfer.py
def upload_file(self, dryadUrl=None, filename=None,
                mimetype=None, size=None, descr=None,
                hashtype=None,
                #md5=None, studyId=None, dest=None,
                digest=None, studyId=None, dest=None,
                fprefix=None, force_unlock=False, timeout=300):
    '''
    Uploads file to Dataverse study. Returns a tuple of the
    dryadFid (or None) and Dataverse JSON from the POST request.
    Failures produce JSON with different status messages
    rather than raising an exception.

    Parameters
    ----------
    filename : str
        Filename (not including path).
    mimetype : str
        Mimetype of file.
    size : int
        Size in bytes.
    studyId : str
        Persistent Dataverse study identifier.
        Defaults to Transfer.dvpid.
    dest : str
        Destination dataverse installation url.
        Defaults to constants.DVURL.
    hashtype: str
        original Dryad hash type
    fprefix : str
        Path to file, not including a trailing slash.
    timeout : int
        Timeout in seconds for POST request. Default 300.
    dryadUrl : str
        Dryad download URL if you want to include a Dryad file id.
    force_unlock : bool
        Attempt forcible unlock instead of waiting for tabular
        file processing.
        Defaults to False.
        The Dataverse `/locks` endpoint blocks POST and DELETE requests
        from non-superusers (undocumented as of 31 March 2021).
        **Forcible unlock requires a superuser API key.**
    '''
    #return locals()
    #TODONE remove above
    if not studyId:
        studyId = self.dvpid
    if not dest:
        dest = constants.DVURL
    if not fprefix:
        fprefix = constants.TMP
    if dryadUrl:
        fid = dryadUrl.strip('/download')
        fid = int(fid[fid.rfind('/')+1:])
    else:
        fid = 0 #dummy fid for non-Dryad use
    params = {'persistentId' : studyId}
    upfile = fprefix + os.sep + filename[:]
    badExt = filename[filename.rfind('.'):].lower()
    #Descriptions are technically possible, although how to add
    #them is buried in Dryad's API documentation
    dv4meta = {'label' : filename[:], 'description' : descr}
    #if mimetype == 'application/zip' or filename.lower().endswith('.zip'):
    if mimetype == 'application/zip' or badExt in constants.NOTAB:
        mimetype = 'application/octet-stream' # stop unzipping automatically
        filename += '.NOPROCESS' # Also screw with their naming convention
        #debug log about file names to see what is up with XSLX
        #see doi:10.5061/dryad.z8w9ghxb6
        LOGGER.debug('File renamed to %s for upload', filename)
    if size >= constants.MAX_UPLOAD:
        fail = (fid, {'status' : 'Failure: MAX_UPLOAD size exceeded'})
        self.fileUpRecord.append(fail)
        LOGGER.warning('%s: File %s of '
                       'size %s exceeds '
                       'Dataverse MAX_UPLOAD size. Skipping.', self.doi, filename, size)
        return fail

    fields = {'file': (filename, open(upfile, 'rb'), mimetype)}
    fields.update({'jsonData': f'{dv4meta}'})
    multi = MultipartEncoder(fields=fields)
    ctype = {'Content-type' : multi.content_type}
    tmphead = self.auth.copy()
    tmphead.update(ctype)
    tmphead.update(USER_AGENT)
    url = dest + '/api/datasets/:persistentId/add'
    try:
        upload = self.session.post(url, params=params,
                                   headers=tmphead,
                                   data=multi, timeout=timeout)
        #print(upload.text)
        upload.raise_for_status()
        self.fileUpRecord.append((fid, upload.json()))
        upmd5 = upload.json()['data']['files'][0]['dataFile']['checksum']['value']
        #Dataverse hash type
        _type = upload.json()['data']['files'][0]['dataFile']['checksum']['type']
        if _type.lower() != hashtype.lower():
            comparator = self._check_md5(upfile, _type.lower())
        else:
            comparator = digest
        #if hashtype.lower () != 'md5':
        #    #get an md5 because dataverse uses md5s. Or most of them do anyway.
        #    #One day this will be rewritten properly.
        #    md5 = self._check_md5(filename, 'md5')
        #else:
        #    md5 = digest
        #if md5 and (upmd5 != md5):
        if upmd5 != comparator:
            try:
                raise exceptions.HashError(f'{_type} mismatch:\nlocal: {comparator}\nuploaded: {upmd5}')
            except exceptions.HashError as e:
                LOGGER.exception(e)
                raise

        #Make damn sure that the study isn't locked because of
        #tab file processing
        ##SPSS files still process despite spoofing MIME and extension
        ##so there's also a forcible unlock check

        #fid = upload.json()['data']['files'][0]['dataFile']['id']
        #fid not required for unlock
        #self.force_notab_unlock(studyId, dest, fid)
        if force_unlock:
            self.force_notab_unlock(studyId, dest)
        else:
            count = 0
            wait = True
            while wait:
                wait = self.file_lock_check(studyId, dest, count=count)
                if wait:
                    time.sleep(15) # Don't hit it too often
                count += 1


        return (fid, upload.json())

    except Exception as e:
        LOGGER.exception(e)
        try:
            reason = upload.json()['message']
            LOGGER.warning(upload.json())
            return (fid, {'status' : f'Failure: {reason}'})
        except Exception as e:
            LOGGER.warning('Further exceptions!')
            LOGGER.exception(e)
            LOGGER.warning(upload.text)
            return (fid, {'status' : f'Failure: Reason {upload.reason}'})
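
A single-file sketch that reuses the same positional unpacking upload_files() applies to entries in trans.files; it assumes the file has already been downloaded and that upload_study() has set Transfer.dvpid:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
# ... trans.upload_study(targetDv='somedataverse') and trans.download_files() first ...
first = trans.files[0]
fid, response = trans.upload_file(*list(first)[:-1])  # same unpacking as upload_files()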

upload_files(files=None, pid=None, fprefix=None, force_unlock=False)

Uploads multiple files to study with persistentId pid. Returns a list of the original tuples plus JSON responses.

Parameters:
  • files (list, default: None ) –

    List contains tuples with (dryadDownloadURL, filename, mimetype, size).

  • pid (str, default: None ) –

    Defaults to self.dvpid, which is generated by calling dryad2dataverse.transfer.Transfer.upload_study().

  • fprefix (str, default: None ) –

    File location prefix. Defaults to dryad2dataverse.constants.TMP

  • force_unlock (bool, default: False ) –

    Attempt forcible unlock instead of waiting for tabular file processing. Defaults to False. The Dataverse /locks endpoint blocks POST and DELETE requests from non-superusers (undocumented as of 31 March 2021). Forcible unlock requires a superuser API key.

Source code in src/dryad2dataverse/transfer.py
def upload_files(self, files=None, pid=None, fprefix=None, force_unlock=False):
    '''
    Uploads multiple files to study with persistentId pid.
    Returns a list of the original tuples plus JSON responses.

    Parameters
    ----------
    files : list
        List contains tuples with
        (dryadDownloadURL, filename, mimetype, size).
    pid : str
        Defaults to self.dvpid, which is generated by calling
        dryad2dataverse.transfer.Transfer.upload_study().
    fprefix : str
        File location prefix.
        Defaults to dryad2dataverse.constants.TMP
    force_unlock : bool
        Attempt forcible unlock instead of waiting for tabular
        file processing.
        Defaults to False.
        The Dataverse `/locks` endpoint blocks POST and DELETE requests
        from non-superusers (undocumented as of 31 March 2021).
        **Forcible unlock requires a superuser API key.**
    '''
    if not files:
        files = self.files
    if not fprefix:
        fprefix = constants.TMP
    out = []
    for f in files:
        #out.append(self.upload_file(f[0], f[1], f[2], f[3],
        #                             f[4], f[5], pid, fprefix=fprefix))
        #out.append(self.upload_file(*[x for x in f],
        #last item in files is not necessary
        out.append(self.upload_file(*list(f)[:-1],
                                    studyId=pid, fprefix=fprefix,
                                    force_unlock=force_unlock))
    return out
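
The bulk uploader is normally called with no file list at all; a sketch, assuming the study metadata has already been created with upload_study() and the files downloaded to constants.TMP:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
# ... trans.upload_study(targetDv='somedataverse') and trans.download_files() first ...
results = trans.upload_files()  # list of (dryad file id, Dataverse JSON) tuples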

upload_json(studyId=None, dest=None)

Uploads Dryad json as a separate file for archival purposes.

Parameters:
  • studyId (str, default: None ) –

    Dataverse persistent identifier. Default dryad2dataverse.transfer.Transfer.dvpid, which is only generated on dryad2dataverse.transfer.Transfer.upload_study()

  • dest (str, default: None ) –

    Base URL for transfer. Default dryad2datavese.constants.DVURL

Source code in src/dryad2dataverse/transfer.py
def upload_json(self, studyId=None, dest=None):
    '''
    Uploads Dryad json as a separate file for archival purposes.

    Parameters
    ----------
    studyId : str
        Dataverse persistent identifier.
        Default dryad2dataverse.transfer.Transfer.dvpid,
        which is only generated on
        dryad2dataverse.transfer.Transfer.upload_study()
    dest : str
        Base URL for transfer.
        Default dryad2dataverse.constants.DVURL
    '''
    if not studyId:
        studyId = self.dvpid
    if not dest:
        dest = constants.DVURL
    if not self.jsonFlag:
        url = dest + '/api/datasets/:persistentId/add'
        pack = io.StringIO(json.dumps(self.dryad.dryadJson))
        desc = {'description':'Original JSON from Dryad',
                'categories':['Documentation', 'Code']}
        fname = self.doi[self.doi.rfind('/')+1:].replace('.', '_')
        payload = {'file': (f'{fname}.json', pack, 'text/plain;charset=UTF-8'),
                   'jsonData':f'{desc}'}
        params = {'persistentId':studyId}
        try:
            meta = self.session.post(f'{url}',
                                     params=params,
                                     headers=self.auth,
                                     files=payload)
            #0 because no dryad fid will be zero
            meta.raise_for_status()
            self.fileUpRecord.append((0, meta.json()))
            self.jsonFlag = (0, meta.json())
            LOGGER.debug('Successfully uploaded Dryad JSON to %s', studyId)

        #JSON uploads randomly fail with a Dataverse server.log error of
        #"A system exception occurred during an invocation on EJB . . ."
        #Not reproducible, so errors will only be written to the log.
        #Jesus.
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError) as err:
            LOGGER.error('Unable to upload Dryad JSON to %s', studyId)
            LOGGER.error('ERROR message: %s', meta.text)
            LOGGER.exception(err)
            #And further checking as to what is happening
            self.fileUpRecord.append((0, {'status':'Failure: Unable to upload Dryad JSON'}))
            if not isinstance(self.dryad.dryadJson, dict):
                LOGGER.error('Dryad JSON is not a dictionary')
        except Exception as err:
            LOGGER.error('Unable to upload Dryad JSON')
            LOGGER.exception(err)
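
A sketch of archiving the original Dryad metadata alongside the data files; studyId and dest fall back to Transfer.dvpid and constants.DVURL:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
# ... trans.upload_study(targetDv='somedataverse') first ...
trans.upload_json()  # skipped automatically if jsonFlag shows it was already uploaded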

upload_study(url=None, apikey=None, timeout=45, **kwargs)

Uploads Dryad study metadata to target Dataverse or updates existing. Supplying a targetDv kwarg creates a new study and supplying a dvpid kwarg updates a currently existing Dataverse study.

Parameters:
  • url (str, default: None ) –

    URL of Dataverse instance. Defaults to constants.DVURL.

  • apikey (str, default: None ) –

    API key of user. Defaults to constants.APIKEY.

  • timeout (int, default: 45 ) –

    timeout on POST request.

  • kwargs (dict, default: {} ) –
  • targetDv (str) –

    Short name of target dataverse. Required if new dataset. Specify as targetDv=value.

  • dvpid (str) –

    Dataverse persistent ID (for updating metadata). This is not required for new uploads, specify as dvpid=value

Notes

One of targetDv or dvpid is required.

Source code in src/dryad2dataverse/transfer.py
def upload_study(self, url=None, apikey=None, timeout=45, **kwargs):
    '''
    Uploads Dryad study metadata to target Dataverse or updates existing.
    Supplying a `targetDv` kwarg creates a new study and supplying a
    `dvpid` kwarg updates a currently existing Dataverse study.

    Parameters
    ----------
    url : str
        URL of Dataverse instance. Defaults to constants.DVURL.
    apikey : str
        API key of user. Defaults to constants.APIKEY.
    timeout : int
        timeout on POST request.
    kwargs : dict

    Other parameters
    ----------------
    targetDv : str
        Short name of target dataverse. Required if new dataset.
        Specify as targetDv=value.
    dvpid : str
        Dataverse persistent ID (for updating metadata).
        This is not required for new uploads, specify as dvpid=value

    Notes
    -----
    One of targetDv or dvpid is required.
    '''
    if not url:
        url = constants.DVURL
    if not apikey:
        apikey = constants.APIKEY
    headers = {'X-Dataverse-key' : apikey}
    headers.update(USER_AGENT)
    targetDv = kwargs.get('targetDv')
    dvpid = kwargs.get('dvpid')
    #dryFid = kwargs.get('dryFid') #Why did I put this here?
    if not targetDv and not dvpid:
        try:
            raise exceptions.NoTargetError('You must supply one of targetDv \
                                (target dataverse) \
                                 or dvpid (Dataverse persistent ID)')
        except exceptions.NoTargetError as e:
            LOGGER.error('No target dataverse or dvpid supplied')
            LOGGER.exception(e)
            raise

    if targetDv and dvpid:
        try:
            raise ValueError('Supply only one of targetDv or dvpid')
        except ValueError as e:
            LOGGER.exception(e)
            raise
    if not dvpid:
        endpoint = f'{url}/api/dataverses/{targetDv}/datasets'
        upload = self.session.post(endpoint,
                                   headers=headers,
                                   json=self.dryad.dvJson,
                                   timeout=timeout)
        LOGGER.debug(upload.text)
    else:
        endpoint = f'{url}/api/datasets/:persistentId/versions/:draft'
        params = {'persistentId':dvpid}
        #Yes, dataverse uses *different* json for edits
        upload = self.session.put(endpoint, params=params,
                                  headers=headers,
                                  json=self.dryad.dvJson['datasetVersion'],
                                  timeout=timeout)
        #self._dvrecord = upload.json()
        LOGGER.debug(upload.text)

    try:
        updata = upload.json()
        self.dvStudy = updata
        if updata.get('status') != 'OK':
            try:
                raise exceptions.DataverseUploadError(('Status return is not OK.'
                                                       f'{upload.status_code}: '
                                                       f'{upload.reason}. '
                                                       f'{upload.request.url} '
                                                       f'{upload.text}'))
            except exceptions.DataverseUploadError as e:
                LOGGER.exception(e)
                LOGGER.exception(traceback.format_exc())
                raise exceptions.DataverseUploadError(('Status return is not OK.'
                                                       f'{upload.status_code}: '
                                                       f'{upload.reason}. '
                                                       f'{upload.request.url} '
                                                       f'{upload.text}'))
        upload.raise_for_status()
    except Exception as e: # Only accessible via non-requests exception
        LOGGER.exception(e)
        LOGGER.exception(traceback.format_exc())
        raise

    if targetDv:
        self.dryad.dvpid = updata['data'].get('persistentId')
    if dvpid:
        self.dryad.dvpid = updata['data'].get('datasetPersistentId')
    return self.dvpid
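
A sketch of the two modes; 'somedataverse' is a placeholder collection alias, and the second call shows a metadata update against an existing Dataverse record:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

trans = Transfer(Serializer('doi:10.5061/dryad.2rbnzs7jp'))
pid = trans.upload_study(targetDv='somedataverse')  # new study; returns the persistent ID
# later, after the Dryad metadata changes:
trans.upload_study(dvpid=pid)                       # update the existing draft in place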

dryad2dataverse.monitor

Dryad/Dataverse status tracker. Monitor creates a singleton object which writes to a SQLite database. Methods will (generally) take either a dryad2dataverse.serializer.Serializer instance or a dryad2dataverse.transfer.Transfer instance.

The monitor’s primary function is to allow state checking of Dryad studies so that files and studies aren’t downloaded unnecessarily.
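
A pipeline-style sketch, assuming the database path is writable; Monitor falls back to constants.DBASE when no path is given:

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.monitor import Monitor

mon = Monitor('/tmp/dryad_tracker.sqlite3')  # illustrative path
ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
state = mon.status(ser)
if state['status'] == 'new':
    pass  # transfer the whole study for the first time
elif state['status'] == 'updated':
    pass  # only changed files/metadata need attention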

Monitor

The Monitor object is a tracker and database updater, so that Dryad files can be monitored and updated over time. Monitor is a singleton, but is not thread-safe.

Source code in src/dryad2dataverse/monitor.py
class Monitor():
    '''
    The Monitor object is a tracker and database updater, so that
    Dryad files can be monitored and updated over time. Monitor is a singleton,
    but is not thread-safe.
    '''
    __instance = None

    def __new__(cls, dbase=None, *args, **kwargs):
        '''
        Creates a new singleton instance of Monitor.

        Also creates a database if existing database is not present.

        Parameters
        ----------
        dbase : str
            Path to sqlite3 database. That is:
            /path/to/file.sqlite3

        *args : list
        **kwargs : dict
        '''
        if cls.__instance is None:
            cls.__instance = super(Monitor, cls).__new__(cls)
            cls.__instance.__initialized = False
            cls.dbase = dbase
            if not cls.dbase:
                cls.dbase = constants.DBASE
            cls.conn = sqlite3.Connection(cls.dbase)
            cls.cursor = cls.conn.cursor()
            create = ['CREATE TABLE IF NOT EXISTS dryadStudy \
                       (uid INTEGER PRIMARY KEY AUTOINCREMENT, \
                       doi TEXT, lastmoddate TEXT, dryadjson TEXT, \
                       dvjson TEXT);',
                       'CREATE TABLE IF NOT EXISTS dryadFiles \
                       (dryaduid INTEGER REFERENCES dryadStudy (uid), \
                       dryfilesjson TEXT);',
                       'CREATE TABLE IF NOT EXISTS dvStudy \
                       (dryaduid INTEGER references dryadStudy (uid), \
                       dvpid TEXT);',
                       'CREATE TABLE IF NOT EXISTS dvFiles \
                       (dryaduid INTEGER references dryadStudy (uid), \
                       dryfid INT, \
                       drymd5 TEXT, dvfid TEXT, dvmd5 TEXT, \
                       dvfilejson TEXT);',
                       'CREATE TABLE IF NOT EXISTS lastcheck \
                       (checkdate TEXT);',
                       'CREATE TABLE IF NOT EXISTS failed_uploads \
                       (dryaduid INTEGER references dryadstudy (uid), \
                       dryfid INT, status TEXT);'
                      ]

            for line in create:
                cls.cursor.execute(line)
            cls.conn.commit()
            LOGGER.info('Using database %s', cls.dbase)

        return cls.__instance

    def __init__(self, dbase=None, *args, **kwargs):
        # remove args and kwargs when you find out how init interacts with new.
        '''
        Initialize the Monitor instance if not instantiated already (ie, Monitor
        is a singleton).

        Parameters
        ----------
        dbase : str, default=dryad2datverse.constants.DBASE
            Complete path to desired location of tracking database
            (eg: /tmp/test.db).
        *args : list
        **kwargs : dict
        '''
        if self.__initialized:
            return
        self.__initialized = True
        if not dbase:
            self.dbase = constants.DBASE
        else:
            self.dbase = dbase

    def __del__(self):
        '''
        Commits all database transactions on object deletion and closes database.
        '''
        self.conn.commit()
        self.conn.close()

    @property
    def lastmod(self):
        '''
        Returns last modification date from monitor.dbase.
        '''
        self.cursor.execute('SELECT checkdate FROM lastcheck ORDER BY rowid DESC;')
        last_mod = self.cursor.fetchall()
        if last_mod:
            return last_mod[0][0]
        return None

    def status(self, serial)->dict:
        '''
        Returns a dictionary with keys 'status', 'dvpid', and 'notes'.

        Parameters
        ----------
        serial :  dryad2dataverse.serializer.Serializer

        Returns
        -------
        `{status :'updated', 'dvpid':'doi://some/ident'}`.

        Notes
        ------
        `status` is one of 'new', 'identical',  'lastmodsame',
        'updated'

        'new' is a completely new study (there is no record of it in the database).

        'identical' The metadata from Dryad is *identical* to the last time
        the check was run.

        'lastmodsame' Dryad lastModificationDate == the last modification date
        in the database AND the output JSON is different.
        This can indicate a Dryad API output change, reindexing, or
        something else. Because lastModificationDate is supposed to be an
        indicator of meaningful change, this status is returned so that you
        can decide how to handle the discrepancy.

        'updated' indicates a change to lastModificationDate.

        Note that Dryad constantly changes their API output, so the changes
        may not actually be meaningful.

        `dvpid` is a Dataverse persistent identifier.
        `None` in the case of status='new'

        `notes`: value of Dryad versionChanges field. One of `files_changed` or
        `metadata_changed`. A non-null value is present only when status is
        not `new` or `identical`. Note that Dryad has no way to indicate *both*
        a file and metadata change, so this value reflects only the *last* change
        in the Dryad state.
        '''
        # Last mod date is indicator of change.
        # From email w/Ryan Scherle 10 Nov 2020
        #The versionNumber updates for either a metadata change or a
        #file change. Although we save all of these changes internally, our web
        #interface only displays the versions that have file changes, along
        #with the most recent metadata. So a dataset that has only two versions
        #of files listed on the web may actually have several more versions in
        #the API.
        #
        #If your only need is to track when there are changes to a
        #dataset, you may want to use the `lastModificationDate`, which we have
        #recently added to our metadata.
        #
        #Note that the Dryad API output ISN'T STABLE; they add fields etc.
        #This means that a comparison of JSON may yield differences even though
        #metadata is technically "the same". Just comparing two dicts doesn't cut
        #it.
        #############################
        ## Note: by inspection, Dryad outputs JSON that is different
        ## EVEN IF lastModificationDate is unchanged. (14 January 2022)
        ## So now what?
        #############################
        doi = serial.dryadJson['identifier']
        self.cursor.execute('SELECT * FROM dryadStudy WHERE doi = ?',
                            (doi,))
        result = self.cursor.fetchall()

        if not result:
            return {'status': 'new', 'dvpid': None, 'notes': ''}
        # dvjson = json.loads(result[-1][4])
        # Check the fresh vs. updated jsons for the keys
        try:
            dryaduid = result[-1][0]
            self.cursor.execute('SELECT dvpid from dvStudy WHERE \
                                 dryaduid = ?', (dryaduid,))
            dvpid = self.cursor.fetchall()[-1][0]
            serial.dvpid = dvpid
        except TypeError:
            try:
                raise exceptions.DatabaseError
            except exceptions.DatabaseError as e:
                LOGGER.error('Dryad DOI : %s. Error finding Dataverse PID', doi)
                LOGGER.exception(e)
                raise
        newfile = copy.deepcopy(serial.dryadJson)
        testfile = copy.deepcopy(json.loads(result[-1][3]))
        if newfile == testfile:
            return {'status': 'identical', 'dvpid': dvpid, 'notes': ''}
        if newfile['lastModificationDate'] != testfile['lastModificationDate']:
            return {'status': 'updated', 'dvpid': dvpid,
                    'notes': newfile['versionChanges']}
        return {'status': 'lastmodsame', 'dvpid': dvpid,
                     'notes': newfile.get('versionChanges')}

    def diff_metadata(self, serial):
        '''
        Analyzes differences in metadata between current serializer
        instance and last updated serializer instance.

        Parameters
        ----------
        serial : dryad2dataverse.serializer.Serializer

        Returns
        -------
        Returns a list of field changes consisting of:
        [{key: (old_value, new_value)}] or None if no changes.

        Notes
        -----
        For example:
        ```
        [{'title':
        ('Cascading effects of algal warming in a freshwater community',
         'Cascading effects of algal warming in a freshwater community theatre')}
        ]
        ```
        '''
        if self.status(serial)['status'] == 'updated':
            self.cursor.execute('SELECT dryadjson from dryadStudy \
                                 WHERE doi = ?',
                                (serial.dryadJson['identifier'],))
            oldJson = json.loads(self.cursor.fetchall()[-1][0])
            out = []
            for k in serial.dryadJson:
                if serial.dryadJson[k] != oldJson.get(k):
                    out.append({k: (oldJson.get(k), serial.dryadJson[k])})
            return out

        return None

    @staticmethod
    def __added_hashes(oldFiles, newFiles):
        '''
        Checks that two objects in dryad2dataverse.serializer.files format
        stripped of digestType and digest values are identical. Returns array
        of files with changed hash.

        Assumes name, mimeType, size, descr all unchanged, which is not
        necessarily a valid assumption

        Parameters
        ----------
        oldFiles : Union[list, tuple]
            (name, mimeType, size, descr, digestType, digest)

        newFiles : Union[list, tuple]
            (name, mimeType, size, descr, digestType, digest)
        '''
        hash_change = []
        old = [x[1:-2] for x in oldFiles]
        #URLs are not permanent
        old_no_url = [x[1:] for x in oldFiles]
        for fi in newFiles:
            if fi[1:-2] in old and fi[1:] not in old_no_url:
                hash_change.append(fi)
        return hash_change


    def diff_files(self, serial):
        '''
        Returns a dict with additions and deletions from previous Dryad
        to dataverse upload.

        Because checksums are not necessarily included in Dryad file
        metadata, this method uses dryad file IDs, size, or
        whatever is available.

        If dryad2dataverse.monitor.Monitor.status()
        indicates a change it will produce dictionary output with a list
        of additions, deletions or hash changes (ie, identical
        except for hash changes), as below:

        `{'add': [dryadfiletuples], 'delete': [dryadfiletuples],
          'hash_change': [dryadfiletuples]}`

        Parameters
        ----------
        serial : dryad2dataverse.serializer.Serializer
        '''
        diffReport = {}
        if self.status(serial)['status'] == 'new':
            #do we want to show what needs to be added?
            return {'add': serial.files}
            #return {}
        self.cursor.execute('SELECT uid from dryadStudy WHERE doi = ?',
                            (serial.doi,))
        mostRecent = self.cursor.fetchall()[-1][0]
        self.cursor.execute('SELECT dryfilesjson from dryadFiles WHERE \
                             dryaduid = ?', (mostRecent,))
        oldFileList = self.cursor.fetchall()[-1][0]
        if not oldFileList:
            oldFileList = []
        else:
            out = []
            #With Dryad API change, files are paginated
            #now stored as list
            for old in json.loads(oldFileList):
            #for old in oldFileList:
                oldFiles = old['_embedded'].get('stash:files')
                # comparing file tuples from dryad2dataverse.serializer.
                # Maybe JSON is better?
                # because of code duplication below.
                for f in oldFiles:
                    #Download links are not persistent. Be warned
                    try:
                        downLink = f['_links']['stash:file-download']['href']
                    except KeyError:
                        downLink = f['_links']['stash:download']['href']
                    downLink = f'{constants.DRYURL}{downLink}'
                    name = f['path']
                    mimeType = f['mimeType']
                    size = f['size']
                    descr = f.get('description', '')
                    digestType = f.get('digestType', '')
                    digest = f.get('digest', '')
                    out.append((downLink, name, mimeType, size, descr, digestType, digest))
                oldFiles = out
        newFiles = serial.files[:]
        # Tests go here
        #Check for identity first
        #if returned here there are definitely no changes
        if (set(oldFiles).issuperset(set(newFiles)) and
                set(newFiles).issuperset(oldFiles)):
            return diffReport
        #filenames for checking hash changes.
        #Can't use URL or hashes for comparisons because they can change
        #without warning, despite the fact that the API says that
        #file IDs are unique. They aren't. Verified by Ryan Scherle at
        #Dryad December 2021
        old_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(oldFiles)}
        new_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(newFiles)}
        old_no_hash = [old_map[x]['no_hash'] for x in old_map]
        new_no_hash = [new_map[x]['no_hash'] for x in new_map]

        #check for added hash only
        hash_change = Monitor.__added_hashes(oldFiles, newFiles)

        must = set(old_no_hash).issuperset(set(new_no_hash))
        if not must:
            needsadd = set(new_no_hash) - (set(old_no_hash) & set(new_no_hash))
            #Use the map created above to return the full file info
            diffReport.update({'add': [new_map[new_no_hash.index(x)]['orig']
                                       for x in needsadd]})
        must = set(new_no_hash).issuperset(old_no_hash)
        if not must:
            needsdel = set(old_no_hash) - (set(new_no_hash) & set(old_no_hash))
            diffReport.update({'delete' : [old_map[old_no_hash.index(x)]['orig']
                                           for x in needsdel]})
        if hash_change:
            diffReport.update({'hash_change': hash_change})
        return diffReport

    def get_dv_fid(self, url):
        '''
        Returns str — the Dataverse file ID from parsing a Dryad
        file download link.  Normally used for determining dataverse
        file ids for *deletion* in case of dryad file changes.

        Parameters
        ----------
        url : str
            *Dryad* file URL in form of
            'https://datadryad.org/api/v2/files/385819/download'.
        '''
        fid = url[url.rfind('/', 0, -10)+1:].strip('/download')
        try:
            fid = int(fid)
        except ValueError as e:
            LOGGER.error('File ID %s is not an integer', fid)
            LOGGER.exception(e)
            raise

        #File IDs are *CHANGEABLE* according to Dryad, Dec 2021
        #SQLite default returns are by ROWID ASC, so the last record
        #returned should still be the correct, ie. most recent, one.
        #However, just in case, this is now done explicitly.
        self.cursor.execute('SELECT dvfid, ROWID FROM dvFiles WHERE \
                             dryfid = ? ORDER BY ROWID ASC;', (fid,))
        dvfid = self.cursor.fetchall()
        if dvfid:
            return dvfid[-1][0]
        return None

    def get_dv_fids(self, filelist):
        '''
        Returns Dataverse file IDs from a list of Dryad file tuples.
        Generally, you would use the output from
        dryad2dataverse.monitor.Monitor.diff_files['delete']
        to discover Dataverse file ids for deletion.

        Parameters
        ----------
        filelist : list
            List of Dryad file tuples: eg:

            ```
            [('https://datadryad.org/api/v2/files/385819/download',
              'GCB_ACG_Mortality_2020.zip',
              'application/x-zip-compressed', 23787587),
             ('https://datadryad.org/api/v2/files/385820/download',
             'Readme_ACG_Mortality.txt',
             'text/plain', 1350)]
             ```
        '''
        fids = []
        for f in filelist:
            fids.append(self.get_dv_fid(f[0]))
        return fids
        # return [self.get_dv_fid(f[0]) for f in filelist]

    def get_json_dvfids(self, serial)->list:
        '''
        Return a list of Dataverse file ids for Dryad JSONs which were
        uploaded to Dataverse.
        Normally used to discover the file IDs to remove Dryad JSONs
        which have changed.

        Parameters
        ----------
        serial : dryad2dataverse.serializer.Serializer

        Returns
        -------
        list
        '''
        self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi=?',
                            (serial.doi,))
        try:
            uid = self.cursor.fetchone()[0]
            self.cursor.execute('SELECT dvfid FROM dvFiles WHERE \
                                 dryaduid = ? AND dryfid=?', (uid, 0))
            jsonfid = [f[0] for f in self.cursor.fetchall()]
            return jsonfid

        except TypeError:
            return []

    def update(self, transfer):
        '''
        Updates the Monitor database with information from a
        dryad2dataverse.transfer.Transfer instance.

        If a Dryad primary metadata record has changes, it will be
        deleted from the database.

        This method should be called after all transfers are completed,
        including Dryad JSON updates, as the last action for transfer.

        Parameters
        ----------
        transfer : dryad2dataverse.transfer.Transfer
        '''
        # get the pre-update dryad uid in case we need it.
        self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi = ?',
                            (transfer.dryad.dryadJson['identifier'],))
        olduid = self.cursor.fetchone()[0]
        if olduid:
            olduid = int(olduid)
        if self.status(transfer.dryad)['status'] != 'unchanged':
            doi = transfer.doi
            lastmod = transfer.dryad.dryadJson.get('lastModificationDate')
            dryadJson = json.dumps(transfer.dryad.dryadJson)
            dvJson = json.dumps(transfer.dvStudy)

            # Update study metadata
            self.cursor.execute('INSERT INTO dryadStudy \
                                 (doi, lastmoddate, dryadjson, dvjson) \
                                 VALUES (?, ?, ?, ?)',
                                (doi, lastmod, dryadJson, dvJson))
            self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE \
                                 doi = ?', (doi,))
            dryaduid = self.cursor.fetchone()[0]
            #if type(dryaduid) != int:
            if not isinstance(dryaduid, int):
                try:
                    raise TypeError('Dryad UID is not an integer')
                except TypeError as e:
                    LOGGER.error(e)
                    raise

            # Update dryad file json
            self.cursor.execute('INSERT INTO dryadFiles VALUES (?, ?)',
                                (dryaduid,
                                 json.dumps(transfer.dryad.fileJson)))
            # Update dataverse study map
            self.cursor.execute('SELECT dvpid FROM dvStudy WHERE \
                                 dvpid = ?', (transfer.dryad.dvpid,))
            if not self.cursor.fetchone():
                self.cursor.execute('INSERT INTO dvStudy VALUES (?, ?)',
                                    (dryaduid, transfer.dryad.dvpid))
            else:
                self.cursor.execute('UPDATE dvStudy SET dryaduid=?, \
                                     dvpid=? WHERE dvpid =?',
                                    (dryaduid, transfer.dryad.dvpid,
                                     transfer.dryad.dvpid))

            # Update the files table
            # Because we want to have a *complete* file list for each
            # dryaduid, we have to copy any existing old files,
            # then add and delete.
            if olduid:
                self.cursor.execute('SELECT * FROM dvFiles WHERE \
                                     dryaduid=?', (olduid,))
                inserter = self.cursor.fetchall()
                for rec in inserter:
                    # TODONE FIX THIS #I think it's fixed 11 Feb 21
                    self.cursor.execute('INSERT INTO dvFiles VALUES \
                                         (?, ?, ?, ?, ?, ?)',
                                        (dryaduid, rec[1], rec[2],
                                         rec[3], rec[4], rec[5]))
            # insert newly uploaded files
            for rec in transfer.fileUpRecord:
                try:
                    dvfid = rec[1]['data']['files'][0]['dataFile']['id']
                    # Screw you for burying the file ID this deep
                    recMd5 = rec[1]['data']['files'][0]['dataFile']['checksum']['value']
                except (KeyError, IndexError) as err:
                    #write to failed uploads table instead
                    status = rec[1].get('status')
                    if not status:
                        LOGGER.error('JSON read error for Dryad file ID %s', rec[0])
                        LOGGER.error('File %s for DOI %s may not be uploaded', rec[0], transfer.doi)
                        LOGGER.exception(err)
                        msg = {'status': 'Failure: Other non-specific '
                                         'failure. Check logs'}

                        self.cursor.execute('INSERT INTO failed_uploads VALUES \
                                        (?, ?, ?);', (dryaduid, rec[0], json.dumps(msg)))
                        continue
                    self.cursor.execute('INSERT INTO failed_uploads VALUES \
                                        (?, ?, ?);', (dryaduid, rec[0], json.dumps(rec[1])))
                    LOGGER.warning(type(err))
                    LOGGER.warning('%s. DOI %s, File ID %s',
                                   rec[1].get('status'),
                                   transfer.doi, rec[0])
                    continue
                # md5s verified during upload step, so they should
                # match already
                self.cursor.execute('INSERT INTO dvFiles VALUES \
                                     (?, ?, ?, ?, ?, ?)',
                                    (dryaduid, rec[0], recMd5,
                                     dvfid, recMd5, json.dumps(rec[1])))

            # Now the deleted files
            for rec in transfer.fileDelRecord:
                # fileDelRecord consists only of [fid,fid2, ...]
                # Dryad record ID is int not str
                self.cursor.execute('DELETE FROM dvFiles WHERE dvfid=? \
                                     AND dryaduid=?',
                                    (int(rec), dryaduid))
                LOGGER.debug('deleted dryfid = %s, dryaduid = %s', rec, dryaduid)

            # And lastly, any JSON metadata updates:
            # NOW WHAT?
            # JSON has dryfid==0
            self.cursor.execute('SELECT * FROM dvfiles WHERE \
                                 dryfid=? and dryaduid=?',
                                (0, dryaduid))
            try:
                exists = self.cursor.fetchone()[0]
                # Old metadata must be deleted on a change.
                if exists:
                    shouldDel = self.status(transfer.dryad)['status']
                    if shouldDel == 'updated':
                        self.cursor.execute('DELETE FROM dvfiles WHERE \
                                             dryfid=? and dryaduid=?',
                                            (0, dryaduid))
            except TypeError:
                pass

            if transfer.jsonFlag:
                # update dryad JSON
                djson5 = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['checksum']['value']
                dfid = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['id']
                self.cursor.execute('INSERT INTO dvfiles VALUES \
                                     (?, ?, ?, ?, ?, ?)',
                                    (dryaduid, 0, djson5, dfid,
                                     djson5, json.dumps(transfer.jsonFlag[1])))

        self.conn.commit()

    def set_timestamp(self, curdate=None):
        '''
        Adds the current time to the database. It can be queried and used
        for subsequent update checks. To query the last modification time,
        use the dryad2dataverse.monitor.Monitor.lastmod attribute.

        Parameters
        ----------
        curdate : str
            UTC datetime string in the format suitable for the Dryad API.
            eg. 2021-01-21T21:42:40Z
               or .strftime('%Y-%m-%dT%H:%M:%SZ').
        '''
        #Dryad API uses Zulu time
        if not curdate:
            curdate = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        self.cursor.execute('INSERT INTO lastcheck VALUES (?)',
                            (curdate,))
        self.conn.commit()

lastmod property

Returns last modification date from monitor.dbase.

__del__()

Commits all database transactions on object deletion and closes database.

Source code in src/dryad2dataverse/monitor.py
def __del__(self):
    '''
    Commits all database transactions on object deletion and closes database.
    '''
    self.conn.commit()
    self.conn.close()

__init__(dbase=None, *args, **kwargs)

Initialize the Monitor instance if not instantiated already (ie, Monitor is a singleton).

Parameters:
  • dbase (str, default: dryad2dataverse.constants.DBASE ) –

    Complete path to desired location of tracking database (eg: /tmp/test.db).

  • *args (list, default: () ) –
  • **kwargs (dict, default: {} ) –
Source code in src/dryad2dataverse/monitor.py
def __init__(self, dbase=None, *args, **kwargs):
    # remove args and kwargs when you find out how init interacts with new.
    '''
    Initialize the Monitor instance if not instantiated already (ie, Monitor
    is a singleton).

    Parameters
    ----------
    dbase : str, default=dryad2dataverse.constants.DBASE
        Complete path to desired location of tracking database
        (eg: /tmp/test.db).
    *args : list
    **kwargs : dict
    '''
    if self.__initialized:
        return
    self.__initialized = True
    if not dbase:
        self.dbase = constants.DBASE
    else:
        self.dbase = dbase

__new__(dbase=None, *args, **kwargs)

Creates a new singleton instance of Monitor.

Also creates a database if one does not already exist.

Parameters:
  • dbase (str, default: None ) –

    Path to sqlite3 database. That is: /path/to/file.sqlite3

  • *args (list, default: () ) –
  • **kwargs (dict, default: {} ) –
Source code in src/dryad2dataverse/monitor.py
def __new__(cls, dbase=None, *args, **kwargs):
    '''
    Creates a new singleton instance of Monitor.

    Also creates a database if one does not already exist.

    Parameters
    ----------
    dbase : str
        Path to sqlite3 database. That is:
        /path/to/file.sqlite3

    *args : list
    **kwargs : dict
    '''
    if cls.__instance is None:
        cls.__instance = super(Monitor, cls).__new__(cls)
        cls.__instance.__initialized = False
        cls.dbase = dbase
        if not cls.dbase:
            cls.dbase = constants.DBASE
        cls.conn = sqlite3.Connection(cls.dbase)
        cls.cursor = cls.conn.cursor()
        create = ['CREATE TABLE IF NOT EXISTS dryadStudy \
                   (uid INTEGER PRIMARY KEY AUTOINCREMENT, \
                   doi TEXT, lastmoddate TEXT, dryadjson TEXT, \
                   dvjson TEXT);',
                   'CREATE TABLE IF NOT EXISTS dryadFiles \
                   (dryaduid INTEGER REFERENCES dryadStudy (uid), \
                   dryfilesjson TEXT);',
                   'CREATE TABLE IF NOT EXISTS dvStudy \
                   (dryaduid INTEGER references dryadStudy (uid), \
                   dvpid TEXT);',
                   'CREATE TABLE IF NOT EXISTS dvFiles \
                   (dryaduid INTEGER references dryadStudy (uid), \
                   dryfid INT, \
                   drymd5 TEXT, dvfid TEXT, dvmd5 TEXT, \
                   dvfilejson TEXT);',
                   'CREATE TABLE IF NOT EXISTS lastcheck \
                   (checkdate TEXT);',
                   'CREATE TABLE IF NOT EXISTS failed_uploads \
                   (dryaduid INTEGER references dryadstudy (uid), \
                   dryfid INT, status TEXT);'
                  ]

        for line in create:
            cls.cursor.execute(line)
        cls.conn.commit()
        LOGGER.info('Using database %s', cls.dbase)

    return cls.__instance
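Because Monitor is a singleton, the first instantiation decides which SQLite file backs the tracker; later instantiations return the same object regardless of their arguments. A minimal sketch (the database path below is a placeholder):

```python
from dryad2dataverse import monitor

# First call creates /tmp/dryad_tracker.sqlite3 and its tables if they
# do not already exist.
track = monitor.Monitor('/tmp/dryad_tracker.sqlite3')

# Any later call returns the same instance.
again = monitor.Monitor()
assert track is again
```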

diff_files(serial)

Returns a dict with additions and deletions from previous Dryad to dataverse upload.

Because checksums are not necessarily included in Dryad file metadata, this method uses dryad file IDs, size, or whatever is available.

If dryad2dataverse.monitor.Monitor.status() indicates a change it will produce dictionary output with a list of additions, deletions or hash changes (ie, identical except for hash changes), as below:

{'add': [dryadfiletuples], 'delete': [dryadfiletuples], 'hash_change': [dryadfiletuples]}

Parameters:
  • serial ( dryad2dataverse.serializer.Serializer) –
Source code in src/dryad2dataverse/monitor.py
def diff_files(self, serial):
    '''
    Returns a dict with additions and deletions from previous Dryad
    to dataverse upload.

    Because checksums are not necessarily included in Dryad file
    metadata, this method uses dryad file IDs, size, or
    whatever is available.

    If dryad2dataverse.monitor.Monitor.status()
    indicates a change it will produce dictionary output with a list
    of additions, deletions or hash changes (ie, identical
    except for hash changes), as below:

    `{'add': [dryadfiletuples], 'delete': [dryadfiletuples],
      'hash_change': [dryadfiletuples]}`

    Parameters
    ----------
    serial : dryad2dataverse.serializer.Serializer
    '''
    diffReport = {}
    if self.status(serial)['status'] == 'new':
        #do we want to show what needs to be added?
        return {'add': serial.files}
        #return {}
    self.cursor.execute('SELECT uid from dryadStudy WHERE doi = ?',
                        (serial.doi,))
    mostRecent = self.cursor.fetchall()[-1][0]
    self.cursor.execute('SELECT dryfilesjson from dryadFiles WHERE \
                         dryaduid = ?', (mostRecent,))
    oldFileList = self.cursor.fetchall()[-1][0]
    if not oldFileList:
        oldFileList = []
    else:
        out = []
        #With Dryad API change, files are paginated
        #now stored as list
        for old in json.loads(oldFileList):
        #for old in oldFileList:
            oldFiles = old['_embedded'].get('stash:files')
            # comparing file tuples from dryad2dataverse.serializer.
            # Maybe JSON is better?
            # because of code duplication below.
            for f in oldFiles:
                #Download links are not persistent. Be warned
                try:
                    downLink = f['_links']['stash:file-download']['href']
                except KeyError:
                    downLink = f['_links']['stash:download']['href']
                downLink = f'{constants.DRYURL}{downLink}'
                name = f['path']
                mimeType = f['mimeType']
                size = f['size']
                descr = f.get('description', '')
                digestType = f.get('digestType', '')
                digest = f.get('digest', '')
                out.append((downLink, name, mimeType, size, descr, digestType, digest))
            oldFiles = out
    newFiles = serial.files[:]
    # Tests go here
    #Check for identity first
    #if returned here there are definitely no changes
    if (set(oldFiles).issuperset(set(newFiles)) and
            set(newFiles).issuperset(oldFiles)):
        return diffReport
    #filenames for checking hash changes.
    #Can't use URL or hashes for comparisons because they can change
    #without warning, despite the fact that the API says that
    #file IDs are unique. They aren't. Verified by Ryan Scherle at
    #Dryad December 2021
    old_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(oldFiles)}
    new_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(newFiles)}
    old_no_hash = [old_map[x]['no_hash'] for x in old_map]
    new_no_hash = [new_map[x]['no_hash'] for x in new_map]

    #check for added hash only
    hash_change = Monitor.__added_hashes(oldFiles, newFiles)

    must = set(old_no_hash).issuperset(set(new_no_hash))
    if not must:
        needsadd = set(new_no_hash) - (set(old_no_hash) & set(new_no_hash))
        #Use the map created above to return the full file info
        diffReport.update({'add': [new_map[new_no_hash.index(x)]['orig']
                                   for x in needsadd]})
    must = set(new_no_hash).issuperset(old_no_hash)
    if not must:
        needsdel = set(old_no_hash) - (set(new_no_hash) & set(old_no_hash))
        diffReport.update({'delete' : [old_map[old_no_hash.index(x)]['orig']
                                       for x in needsdel]})
    if hash_change:
        diffReport.update({'hash_change': hash_change})
    return diffReport
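A usage sketch, assuming the Serializer fetches its Dryad metadata on demand (the DOI and database path below are placeholders):

```python
from dryad2dataverse import monitor, serializer

serial = serializer.Serializer('doi:10.5061/dryad.example')  # placeholder DOI
track = monitor.Monitor('/tmp/dryad_tracker.sqlite3')

changes = track.diff_files(serial)
for f in changes.get('add', []):
    print('needs upload:', f)
for f in changes.get('delete', []):
    print('needs removal:', f)
if changes.get('hash_change'):
    print('files with changed digests:', changes['hash_change'])
```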

diff_metadata(serial)

Analyzes differences in metadata between current serializer instance and last updated serializer instance.

Parameters:
  • serial ( dryad2dataverse.serializer.Serializer) –
Returns:
  • Returns a list of field changes consisting of:
  • [{key: (old_value, new_value)}] or None if no changes.
Notes

For example:

[{'title':
('Cascading effects of algal warming in a freshwater community',
 'Cascading effects of algal warming in a freshwater community theatre')}
]
Source code in src/dryad2dataverse/monitor.py
def diff_metadata(self, serial):
    '''
    Analyzes differences in metadata between current serializer
    instance and last updated serializer instance.

    Parameters
    ----------
    serial : dryad2dataverse.serializer.Serializer

    Returns
    -------
    Returns a list of field changes consisting of:
    [{key: (old_value, new_value)}] or None if no changes.

    Notes
    -----
    For example:
    ```
    [{'title':
    ('Cascading effects of algal warming in a freshwater community',
     'Cascading effects of algal warming in a freshwater community theatre')}
    ]
    ```
    '''
    if self.status(serial)['status'] == 'updated':
        self.cursor.execute('SELECT dryadjson from dryadStudy \
                             WHERE doi = ?',
                            (serial.dryadJson['identifier'],))
        oldJson = json.loads(self.cursor.fetchall()[-1][0])
        out = []
        for k in serial.dryadJson:
            if serial.dryadJson[k] != oldJson.get(k):
                out.append({k: (oldJson.get(k), serial.dryadJson[k])})
        return out

    return None
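Continuing the sketch above, the field-level changes can be inspected when status() reports 'updated':

```python
changed = track.diff_metadata(serial)
if changed:
    for field in changed:
        for key, (old, new) in field.items():
            print(f'{key}: {old!r} -> {new!r}')
```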

get_dv_fid(url)

Returns str — the Dataverse file ID from parsing a Dryad file download link. Normally used for determining dataverse file ids for deletion in case of dryad file changes.

Parameters:
  • url (str) –

    Dryad file URL in form of ‘https://datadryad.org/api/v2/files/385819/download’.

Source code in src/dryad2dataverse/monitor.py
def get_dv_fid(self, url):
    '''
    Returns str — the Dataverse file ID from parsing a Dryad
    file download link.  Normally used for determining dataverse
    file ids for *deletion* in case of dryad file changes.

    Parameters
    ----------
    url : str
        *Dryad* file URL in form of
        'https://datadryad.org/api/v2/files/385819/download'.
    '''
    fid = url[url.rfind('/', 0, -10)+1:].strip('/download')
    try:
        fid = int(fid)
    except ValueError as e:
        LOGGER.error('File ID %s is not an integer', fid)
        LOGGER.exception(e)
        raise

    #File IDs are *CHANGEABLE* according to Dryad, Dec 2021
    #SQLite default returns are by ROWID ASC, so the last record
    #returned should still be the correct, ie. most recent, one.
    #However, just in case, this is now done explicitly.
    self.cursor.execute('SELECT dvfid, ROWID FROM dvFiles WHERE \
                         dryfid = ? ORDER BY ROWID ASC;', (fid,))
    dvfid = self.cursor.fetchall()
    if dvfid:
        return dvfid[-1][0]
    return None
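For example, using the download URL shown in the docstring and continuing the Monitor instance from the sketches above:

```python
# Returns the Dataverse file id recorded for this Dryad download link,
# or None if it has never been transferred.
dv_fid = track.get_dv_fid('https://datadryad.org/api/v2/files/385819/download')
```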

get_dv_fids(filelist)

Returns Dataverse file IDs from a list of Dryad file tuples. Generally, you would use the output from dryad2dataverse.monitor.Monitor.diff_files[‘delete’] to discover Dataverse file ids for deletion.

Parameters:
  • filelist (list) –

    List of Dryad file tuples: eg:

    [('https://datadryad.org/api/v2/files/385819/download', 'GCB_ACG_Mortality_2020.zip', 'application/x-zip-compressed', 23787587), ('https://datadryad.org/api/v2/files/385820/download', 'Readme_ACG_Mortality.txt', 'text/plain', 1350)]

Source code in src/dryad2dataverse/monitor.py
def get_dv_fids(self, filelist):
    '''
    Returns Dataverse file IDs from a list of Dryad file tuples.
    Generally, you would use the output from
    dryad2dataverse.monitor.Monitor.diff_files['delete']
    to discover Dataverse file ids for deletion.

    Parameters
    ----------
    filelist : list
        List of Dryad file tuples: eg:

        ```
        [('https://datadryad.org/api/v2/files/385819/download',
          'GCB_ACG_Mortality_2020.zip',
          'application/x-zip-compressed', 23787587),
         ('https://datadryad.org/api/v2/files/385820/download',
         'Readme_ACG_Mortality.txt',
         'text/plain', 1350)]
         ```
    '''
    fids = []
    for f in filelist:
        fids.append(self.get_dv_fid(f[0]))
    return fids
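A typical pairing with diff_files, continuing the sketch above:

```python
deletions = track.diff_files(serial).get('delete', [])
# Dataverse file ids corresponding to Dryad files that have disappeared;
# these are the ids you would remove from the Dataverse study.
dv_fids = track.get_dv_fids(deletions)
```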

get_json_dvfids(serial)

Return a list of Dataverse file ids for Dryad JSONs which were uploaded to Dataverse. Normally used to discover the file IDs to remove Dryad JSONs which have changed.

Parameters:
  • serial ( dryad2dataverse.serializer.Serializer) –
Returns:
  • list
Source code in src/dryad2dataverse/monitor.py
def get_json_dvfids(self, serial)->list:
    '''
    Return a list of Dataverse file ids for Dryad JSONs which were
    uploaded to Dataverse.
    Normally used to discover the file IDs to remove Dryad JSONs
    which have changed.

    Parameters
    ----------
    serial : dryad2dataverse.serializer.Serializer

    Returns
    -------
    list
    '''
    self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi=?',
                        (serial.doi,))
    try:
        uid = self.cursor.fetchone()[0]
        self.cursor.execute('SELECT dvfid FROM dvFiles WHERE \
                             dryaduid = ? AND dryfid=?', (uid, 0))
        jsonfid = [f[0] for f in self.cursor.fetchall()]
        return jsonfid

    except TypeError:
        return []

set_timestamp(curdate=None)

Adds the current time to the database. It can be queried and used for subsequent update checks. To query the last modification time, use the dryad2dataverse.monitor.Monitor.lastmod attribute.

Parameters:
  • curdate (str, default: None ) –

    UTC datetime string in the format suitable for the Dryad API. eg. 2021-01-21T21:42:40Z or .strftime(‘%Y-%m-%dT%H:%M:%SZ’).

Source code in src/dryad2dataverse/monitor.py
def set_timestamp(self, curdate=None):
    '''
    Adds the current time to the database. It can be queried and used
    for subsequent update checks. To query the last modification time,
    use the dryad2dataverse.monitor.Monitor.lastmod attribute.

    Parameters
    ----------
    curdate : str
        UTC datetime string in the format suitable for the Dryad API.
        eg. 2021-01-21T21:42:40Z
           or .strftime('%Y-%m-%dT%H:%M:%SZ').
    '''
    #Dryad API uses Zulu time
    if not curdate:
        curdate = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    self.cursor.execute('INSERT INTO lastcheck VALUES (?)',
                        (curdate,))
    self.conn.commit()
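A short sketch, continuing from the Monitor instance above:

```python
# Record the current UTC time; pass a 'YYYY-MM-DDTHH:MM:SSZ' string to
# record an explicit timestamp instead.
track.set_timestamp()
print(track.lastmod)  # e.g. '2021-01-21T21:42:40Z'
```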

status(serial)

Returns a dictionary with keys ‘status’, ‘dvpid’, and ‘notes’.

Parameters:
  • serial ( dryad2dataverse.serializer.Serializer) –
Returns:
  • `{'status': 'updated', 'dvpid': 'doi://some/ident'}`.
Notes

status is one of ‘new’, ‘identical’, ‘lastmodsame’, ‘updated’

‘new’ is a completely new study (there is no record of it in the database).

‘identical’ The metadata from Dryad is identical to the last time the check was run.

‘lastmodsame’ Dryad lastModificationDate == the last modification date in the database AND the output JSON is different. This can indicate a Dryad API output change, reindexing, or something else. Because lastModificationDate is supposed to be an indicator of meaningful change, this status is returned so that you can decide how to handle the discrepancy.

‘updated’ indicates a change to lastModificationDate.

Note that Dryad constantly changes their API output, so the changes may not actually be meaningful.

dvpid is a Dataverse persistent identifier. None in the case of status=’new’

notes: value of the Dryad versionChanges field. One of files_changed or metadata_changed. A non-null value is present only when status is not new or identical. Note that Dryad has no way to indicate both a file and metadata change, so this value reflects only the last change in the Dryad state.

Source code in src/dryad2dataverse/monitor.py
def status(self, serial)->dict:
    '''
    Returns a dictionary with keys 'status', 'dvpid', and 'notes'.

    Parameters
    ----------
    serial :  dryad2dataverse.serializer.Serializer

    Returns
    -------
    `{'status': 'updated', 'dvpid': 'doi://some/ident'}`.

    Notes
    ------
    `status` is one of 'new', 'identical',  'lastmodsame',
    'updated'

    'new' is a completely new study (there is no record of it in the database).

    'identical' The metadata from Dryad is *identical* to the last time
    the check was run.

    'lastmodsame' Dryad lastModificationDate == the last modification date
    in the database AND the output JSON is different.
    This can indicate a Dryad API output change, reindexing, or
    something else. Because lastModificationDate is supposed to be an
    indicator of meaningful change, this status is returned so that you
    can decide how to handle the discrepancy.

    'updated' indicates a change to lastModificationDate.

    Note that Dryad constantly changes their API output, so the changes
    may not actually be meaningful.

    `dvpid` is a Dataverse persistent identifier.
    `None` in the case of status='new'

    `notes`: value of Dryad versionChanges field. One of `files_changed` or
    `metadata_changed`. A non-null value is present only when status is
    not `new` or `identical`. Note that Dryad has no way to indicate *both*
    a file and metadata change, so this value reflects only the *last* change
    in the Dryad state.
    '''
    # Last mod date is indicator of change.
    # From email w/Ryan Scherle 10 Nov 2020
    #The versionNumber updates for either a metadata change or a
    #file change. Although we save all of these changes internally, our web
    #interface only displays the versions that have file changes, along
    #with the most recent metadata. So a dataset that has only two versions
    #of files listed on the web may actually have several more versions in
    #the API.
    #
    #If your only need is to track when there are changes to a
    #dataset, you may want to use the `lastModificationDate`, which we have
    #recently added to our metadata.
    #
    #Note that the Dryad API output ISN'T STABLE; they add fields etc.
    #This means that a comparison of JSON may yield differences even though
    #metadata is technically "the same". Just comparing two dicts doesn't cut
    #it.
    #############################
    ## Note: by inspection, Dryad outputs JSON that is different
    ## EVEN IF lastModificationDate is unchanged. (14 January 2022)
    ## So now what?
    #############################
    doi = serial.dryadJson['identifier']
    self.cursor.execute('SELECT * FROM dryadStudy WHERE doi = ?',
                        (doi,))
    result = self.cursor.fetchall()

    if not result:
        return {'status': 'new', 'dvpid': None, 'notes': ''}
    # dvjson = json.loads(result[-1][4])
    # Check the fresh vs. updated jsons for the keys
    try:
        dryaduid = result[-1][0]
        self.cursor.execute('SELECT dvpid from dvStudy WHERE \
                             dryaduid = ?', (dryaduid,))
        dvpid = self.cursor.fetchall()[-1][0]
        serial.dvpid = dvpid
    except TypeError:
        try:
            raise exceptions.DatabaseError
        except exceptions.DatabaseError as e:
            LOGGER.error('Dryad DOI : %s. Error finding Dataverse PID', doi)
            LOGGER.exception(e)
            raise
    newfile = copy.deepcopy(serial.dryadJson)
    testfile = copy.deepcopy(json.loads(result[-1][3]))
    if newfile == testfile:
        return {'status': 'identical', 'dvpid': dvpid, 'notes': ''}
    if newfile['lastModificationDate'] != testfile['lastModificationDate']:
        return {'status': 'updated', 'dvpid': dvpid,
                'notes': newfile['versionChanges']}
    return {'status': 'lastmodsame', 'dvpid': dvpid,
                 'notes': newfile.get('versionChanges')}
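Continuing the sketch above, the returned dictionary can drive a simple decision:

```python
state = track.status(serial)
if state['status'] == 'new':
    print('never transferred; a full upload is required')
elif state['status'] == 'identical':
    print('no change since the last check')
else:
    # 'updated' or 'lastmodsame'
    print('existing study', state['dvpid'], 'reported change:', state['notes'])
```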

update(transfer)

Updates the Monitor database with information from a dryad2dataverse.transfer.Transfer instance.

If a Dryad primary metadata record has changes, it will be deleted from the database.

This method should be called after all transfers are completed, including Dryad JSON updates, as the last action for transfer.

Parameters:
  • transfer ( dryad2dataverse.transfer.Transfer) –
Source code in src/dryad2dataverse/monitor.py
def update(self, transfer):
    '''
    Updates the Monitor database with information from a
    dryad2dataverse.transfer.Transfer instance.

    If a Dryad primary metadata record has changes, it will be
    deleted from the database.

    This method should be called after all transfers are completed,
    including Dryad JSON updates, as the last action for transfer.

    Parameters
    ----------
    transfer : dryad2dataverse.transfer.Transfer
    '''
    # get the pre-update dryad uid in case we need it.
    self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi = ?',
                        (transfer.dryad.dryadJson['identifier'],))
    olduid = self.cursor.fetchone()[0]
    if olduid:
        olduid = int(olduid)
    if self.status(transfer.dryad)['status'] != 'unchanged':
        doi = transfer.doi
        lastmod = transfer.dryad.dryadJson.get('lastModificationDate')
        dryadJson = json.dumps(transfer.dryad.dryadJson)
        dvJson = json.dumps(transfer.dvStudy)

        # Update study metadata
        self.cursor.execute('INSERT INTO dryadStudy \
                             (doi, lastmoddate, dryadjson, dvjson) \
                             VALUES (?, ?, ?, ?)',
                            (doi, lastmod, dryadJson, dvJson))
        self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE \
                             doi = ?', (doi,))
        dryaduid = self.cursor.fetchone()[0]
        #if type(dryaduid) != int:
        if not isinstance(dryaduid, int):
            try:
                raise TypeError('Dryad UID is not an integer')
            except TypeError as e:
                LOGGER.error(e)
                raise

        # Update dryad file json
        self.cursor.execute('INSERT INTO dryadFiles VALUES (?, ?)',
                            (dryaduid,
                             json.dumps(transfer.dryad.fileJson)))
        # Update dataverse study map
        self.cursor.execute('SELECT dvpid FROM dvStudy WHERE \
                             dvpid = ?', (transfer.dryad.dvpid,))
        if not self.cursor.fetchone():
            self.cursor.execute('INSERT INTO dvStudy VALUES (?, ?)',
                                (dryaduid, transfer.dryad.dvpid))
        else:
            self.cursor.execute('UPDATE dvStudy SET dryaduid=?, \
                                 dvpid=? WHERE dvpid =?',
                                (dryaduid, transfer.dryad.dvpid,
                                 transfer.dryad.dvpid))

        # Update the files table
        # Because we want to have a *complete* file list for each
        # dryaduid, we have to copy any existing old files,
        # then add and delete.
        if olduid:
            self.cursor.execute('SELECT * FROM dvFiles WHERE \
                                 dryaduid=?', (olduid,))
            inserter = self.cursor.fetchall()
            for rec in inserter:
                # TODONE FIX THIS #I think it's fixed 11 Feb 21
                self.cursor.execute('INSERT INTO dvFiles VALUES \
                                     (?, ?, ?, ?, ?, ?)',
                                    (dryaduid, rec[1], rec[2],
                                     rec[3], rec[4], rec[5]))
        # insert newly uploaded files
        for rec in transfer.fileUpRecord:
            try:
                dvfid = rec[1]['data']['files'][0]['dataFile']['id']
                # Screw you for burying the file ID this deep
                recMd5 = rec[1]['data']['files'][0]['dataFile']['checksum']['value']
            except (KeyError, IndexError) as err:
                #write to failed uploads table instead
                status = rec[1].get('status')
                if not status:
                    LOGGER.error('JSON read error for Dryad file ID %s', rec[0])
                    LOGGER.error('File %s for DOI %s may not be uploaded', rec[0], transfer.doi)
                    LOGGER.exception(err)
                    msg = {'status': 'Failure: Other non-specific '
                                     'failure. Check logs'}

                    self.cursor.execute('INSERT INTO failed_uploads VALUES \
                                    (?, ?, ?);', (dryaduid, rec[0], json.dumps(msg)))
                    continue
                self.cursor.execute('INSERT INTO failed_uploads VALUES \
                                    (?, ?, ?);', (dryaduid, rec[0], json.dumps(rec[1])))
                LOGGER.warning(type(err))
                LOGGER.warning('%s. DOI %s, File ID %s',
                               rec[1].get('status'),
                               transfer.doi, rec[0])
                continue
            # md5s verified during upload step, so they should
            # match already
            self.cursor.execute('INSERT INTO dvFiles VALUES \
                                 (?, ?, ?, ?, ?, ?)',
                                (dryaduid, rec[0], recMd5,
                                 dvfid, recMd5, json.dumps(rec[1])))

        # Now the deleted files
        for rec in transfer.fileDelRecord:
            # fileDelRecord consists only of [fid,fid2, ...]
            # Dryad record ID is int not str
            self.cursor.execute('DELETE FROM dvFiles WHERE dvfid=? \
                                 AND dryaduid=?',
                                (int(rec), dryaduid))
            LOGGER.debug('deleted dryfid = %s, dryaduid = %s', rec, dryaduid)

        # And lastly, any JSON metadata updates:
        # NOW WHAT?
        # JSON has dryfid==0
        self.cursor.execute('SELECT * FROM dvfiles WHERE \
                             dryfid=? and dryaduid=?',
                            (0, dryaduid))
        try:
            exists = self.cursor.fetchone()[0]
            # Old metadata must be deleted on a change.
            if exists:
                shouldDel = self.status(transfer.dryad)['status']
                if shouldDel == 'updated':
                    self.cursor.execute('DELETE FROM dvfiles WHERE \
                                         dryfid=? and dryaduid=?',
                                        (0, dryaduid))
        except TypeError:
            pass

        if transfer.jsonFlag:
            # update dryad JSON
            djson5 = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['checksum']['value']
            dfid = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['id']
            self.cursor.execute('INSERT INTO dvfiles VALUES \
                                 (?, ?, ?, ?, ?, ?)',
                                (dryaduid, 0, djson5, dfid,
                                 djson5, json.dumps(transfer.jsonFlag[1])))

    self.conn.commit()
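A pipeline-level sketch: the Transfer constructor signature shown here is assumed rather than documented in this section, so check the transfer module reference before relying on it. The DOI and paths are placeholders:

```python
from dryad2dataverse import monitor, serializer, transfer

serial = serializer.Serializer('doi:10.5061/dryad.example')   # placeholder DOI
track = monitor.Monitor('/tmp/dryad_tracker.sqlite3')
dv_up = transfer.Transfer(serial)  # assumed constructor; see the transfer module

# ... perform the metadata and file transfers with dv_up here ...

track.update(dv_up)    # record the completed transfer as the final step
track.set_timestamp()  # optionally note when this run happened
```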

dryad2dataverse.handlers

Custom log handlers for sending log information to recipients.

SSLSMTPHandler

Bases: SMTPHandler

An SSL handler for logging.handlers

Source code in src/dryad2dataverse/handlers.py
class SSLSMTPHandler(SMTPHandler):
    '''
    An SSL handler for logging.handlers
    '''
    def emit(self, record:logging.LogRecord):
        '''
        Emit a record while using an SSL mail server.

        Parameters
        ----------
        record : logging.LogRecord
        '''
        #Praise be to
        #https://stackoverflow.com/questions/36937461/
        #how-can-i-send-an-email-using-python-loggings-
        #smtphandler-and-ssl
        try:
            port = self.mailport
            if not port:
                port = smtplib.SMTP_PORT
            smtp = smtplib.SMTP_SSL(self.mailhost, port)
            msg = self.format(record)
            out = EmailMessage()
            out['Subject'] = self.getSubject(record)
            out['From'] = self.fromaddr
            out['To'] = self.toaddrs
            out.set_content(msg)
            #global rec2
            #rec2 = record
            if self.username:
                smtp.login(self.username, self.password)
            #smtp.sendmail(self.fromaddr, self.toaddrs, msg)
            #Attempting to send using smtp.sendmail as above
            #results in messages with no text, so use
            smtp.send_message(out)
            smtp.quit()
        except (KeyboardInterrupt, SystemExit):
            raise
        except: # pylint: disable=bare-except
            self.handleError(record)

emit(record)

Emit a record while using an SSL mail server.

Parameters:
  • record (LogRecord) –
Source code in src/dryad2dataverse/handlers.py
def emit(self, record:logging.LogRecord):
    '''
    Emit a record while using an SSL mail server.

    Parameters
    ----------
    record : logging.LogRecord
    '''
    #Praise be to
    #https://stackoverflow.com/questions/36937461/
    #how-can-i-send-an-email-using-python-loggings-
    #smtphandler-and-ssl
    try:
        port = self.mailport
        if not port:
            port = smtplib.SMTP_PORT
        smtp = smtplib.SMTP_SSL(self.mailhost, port)
        msg = self.format(record)
        out = EmailMessage()
        out['Subject'] = self.getSubject(record)
        out['From'] = self.fromaddr
        out['To'] = self.toaddrs
        out.set_content(msg)
        #global rec2
        #rec2 = record
        if self.username:
            smtp.login(self.username, self.password)
        #smtp.sendmail(self.fromaddr, self.toaddrs, msg)
        #Attempting to send using smtp.sendmail as above
        #results in messages with no text, so use
        smtp.send_message(out)
        smtp.quit()
    except (KeyboardInterrupt, SystemExit):
        raise
    except: # pylint: disable=bare-except
        self.handleError(record)

dryad2dataverse.exceptions

Custom exceptions for error handling.
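All of the classes below subclass Dryad2DataverseError, so a pipeline can catch the base class to handle any package-specific failure. A sketch with a hypothetical stand-in function:

```python
import logging
from dryad2dataverse import exceptions

logger = logging.getLogger(__name__)

def run_pipeline():
    '''Hypothetical stand-in for a Serializer/Transfer/Monitor workflow.'''
    raise exceptions.DownloadSizeError('download size mismatch')

try:
    run_pipeline()
except exceptions.Dryad2DataverseError:
    # Catches DatabaseError, HashError, DownloadSizeError, etc.
    logger.exception('dryad2dataverse failure')
    raise
```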

DatabaseError

Bases: Dryad2DataverseError

Tracking database error.

Source code in src/dryad2dataverse/exceptions.py
class DatabaseError(Dryad2DataverseError):
    '''
    Tracking database error.
    '''

DataverseBadApiKeyError

Bases: Dryad2DataverseError

Returned on a not-OK response (ie, request.request.json()[‘message’] == ‘Bad api key ‘).

Source code in src/dryad2dataverse/exceptions.py
class DataverseBadApiKeyError(Dryad2DataverseError):
    '''
    Returned on a not-OK response (ie, request.request.json()['message'] == 'Bad api key ').
    '''

DataverseDownloadError

Bases: Dryad2DataverseError

Returned on a not-OK response (ie, not requests.status_code == 200).

Source code in src/dryad2dataverse/exceptions.py
class DataverseDownloadError(Dryad2DataverseError):
    '''
    Returned on a not-OK response (ie, not requests.status_code == 200).
    '''

DataverseUploadError

Bases: Dryad2DataverseError

Returned on a not-OK response (ie, not requests.status_code == 200).

Source code in src/dryad2dataverse/exceptions.py
class DataverseUploadError(Dryad2DataverseError):
    '''
    Returned on a not-OK response (ie, not requests.status_code == 200).
    '''

DownloadSizeError

Bases: Dryad2DataverseError

Raised when download sizes don’t match reported Dryad file size.

Source code in src/dryad2dataverse/exceptions.py
class DownloadSizeError(Dryad2DataverseError):
    '''
    Raised when download sizes don't match reported
    Dryad file size.
    '''

Dryad2DataverseError

Bases: Exception

Base exception class for Dryad2Dataverse errors.

Source code in src/dryad2dataverse/exceptions.py
class Dryad2DataverseError(Exception):
    '''
    Base exception class for Dryad2Dataverse errors.
    '''

HashError

Bases: Dryad2DataverseError

Raised on hex digest mismatch.

Source code in src/dryad2dataverse/exceptions.py
class HashError(Dryad2DataverseError):
    '''
    Raised on hex digest mismatch.
    '''

NoTargetError

Bases: Dryad2DataverseError

No dataverse target supplied error.

Source code in src/dryad2dataverse/exceptions.py
class NoTargetError(Dryad2DataverseError):
    '''
    No dataverse target supplied error.
    '''

dryad2dataverse.constants

This module contains the information that configures all the parameters required to transfer data from Dryad to Dataverse.

“Constants” may be a bit strong, but the only constant is the presence of change.
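Settings are plain module attributes, so they can be overridden before any other dryad2dataverse objects are created. A sketch using the two settings that appear earlier in this reference (other settings, such as the target Dataverse URL and API keys, are configured the same way):

```python
from dryad2dataverse import constants

# Override defaults before constructing Serializer, Transfer, or Monitor objects.
constants.DRYURL = 'https://datadryad.org'                     # Dryad base URL
constants.DBASE = '/var/lib/dryad2dataverse/tracker.sqlite3'   # Monitor database
```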