API Reference¶
dryad2dataverse¶
Dryad to Dataverse utilities. No modules are loaded by default, so
>>> import dryad2dataverse
will work but will have no effect on its own.
Modules included:
- dryad2dataverse.constants : “Constants” for all modules. URLs, API keys, etc. are all here.
- dryad2dataverse.serializer : Download and serialize Dryad JSON to Dataverse JSON.
- dryad2dataverse.transfer : Metadata and file transfer utilities.
- dryad2dataverse.monitor : Monitoring and database tools for maintaining a pipeline to Dataverse without unnecessary downloading and file duplication.
- dryad2dataverse.exceptions : Custom exceptions.
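A minimal import sketch: since the top-level import does nothing on its own, each submodule is loaded explicitly.

import dryad2dataverse.constants
import dryad2dataverse.serializer
import dryad2dataverse.transfer
import dryad2dataverse.monitor
import dryad2dataverse.exceptions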
dryad2dataverse.serializer¶
Serializes Dryad study JSON to Dataverse JSON, as well as producing associated file information.
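A minimal usage sketch, assuming default settings in dryad2dataverse.constants; the DOI is the example used in the class docstring below.

from dryad2dataverse.serializer import Serializer

ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')  # Dryad study DOI
dryad_meta = ser.dryadJson  # fetches the Dryad record on first access
dv_meta = ser.dvJson        # Dataverse-style JSON assembled from the Dryad record
print(dryad_meta.get('title'))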
Serializer¶
Serializes Dryad JSON to Dataverse JSON
Source code in src/dryad2dataverse/serializer.py
class Serializer():
'''
Serializes Dryad JSON to Dataverse JSON
'''
CC0='''<p>
<img src="https://licensebuttons.net/p/zero/1.0/88x31.png" title="Creative Commons CC0 1.0 Universal Public Domain Dedication. " style="display:none" onload="this.style.display='inline'" />
<a href="http://creativecommons.org/publicdomain/zero/1.0" title="Creative Commons CC0 1.0 Universal Public Domain Dedication. " target="_blank">CC0 1.0</a>
</p>'''
def __init__(self, doi):
'''
Creates Dryad study metadata instance.
Parameters
----------
doi : str
DOI of Dryad study. Required for downloading.
eg: 'doi:10.5061/dryad.2rbnzs7jp'
'''
self.doi = doi
self._dryadJson = None
self._fileJson = None
self._dvJson = None
#Serializer objects will be assigned a Dataverse study PID
#if dryad2Dataverse.transfer.Transfer() is instantiated
self.dvpid = None
self.session = requests.Session()
self.session.mount('https://',
HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
LOGGER.debug('Creating Serializer instance object')
def fetch_record(self, url=None, timeout=45):
'''
Fetches Dryad study record JSON from Dryad V2 API at
https://datadryad.org/api/v2/datasets/.
Saves to self._dryadJson. Querying Serializer.dryadJson
will call this function automatically.
Parameters
----------
url : str
Dryad instance base URL (eg: 'https://datadryad.org').
timeout : int
Timeout in seconds. Default 45.
'''
if not url:
url = constants.DRYURL
try:
headers = {'accept':'application/json',
'Content-Type':'application/json'}
headers.update(USER_AGENT)
doiClean = urllib.parse.quote(self.doi, safe='')
resp = self.session.get(f'{url}/api/v2/datasets/{doiClean}',
headers=headers, timeout=timeout)
resp.raise_for_status()
self._dryadJson = resp.json()
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.error('URL error for: %s', url)
LOGGER.exception(err)
raise
@property
def id(self):
'''
Returns Dryad unique *database* ID, not the DOI.
Where the original Dryad JSON is dryadJson, it's the integer
trailing portion of:
`self.dryadJson['_links']['stash:version']['href']`
'''
href = self.dryadJson['_links']['stash:version']['href']
index = href.rfind('/') + 1
return int(href[index:])
@property
def dryadJson(self):
'''
Returns Dryad study JSON. Will call Serializer.fetch_record() if
no JSON is present.
'''
if not self._dryadJson:
self.fetch_record()
return self._dryadJson
@dryadJson.setter
def dryadJson(self, value=None):
'''
Fetches Dryad JSON from Dryad website if not supplied.
If supplying it, make sure it's correct or you will run into trouble
with processing later.
Parameters
----------
value : dict
Dryad JSON.
'''
if value:
self._dryadJson = value
else:
self.fetch_record()
@property
def embargo(self)->bool:
'''
Check embargo status. Returns boolean True if embargoed.
'''
if self.dryadJson.get('curationStatus') == 'Embargoed':
return True
return False
@property
def dvJson(self):
'''
Returns Dataverse study JSON as dict.
'''
self._assemble_json()
return self._dvJson
@property
def fileJson(self, timeout=45):
'''
Returns a list of file JSONs from call to Dryad API /files/{id},
where the ID is parsed from the Dryad JSON. Dryad file listings
are paginated, so the return consists of a list of dicts, one
per page.
Parameters
----------
timeout : int
Request timeout in seconds.
'''
if not self._fileJson:
try:
self._fileJson = []
headers = {'accept':'application/json',
'Content-Type':'application/json'}
headers.update(USER_AGENT)
fileList = self.session.get(f'{constants.DRYURL}/api/v2/versions/{self.id}/files',
headers=headers,
timeout=timeout)
fileList.raise_for_status()
#total = fileList.json()['total'] #Not needed
lastPage = fileList.json()['_links']['last']['href']
pages = int(lastPage[lastPage.rfind('=')+1:])
self._fileJson.append(fileList.json())
for i in range(2, pages+1):
fileCont = self.session.get(f'{constants.DRYURL}/api/v2'
f'/versions/{self.id}/files?page={i}',
headers=headers,
timeout=timeout)
fileCont.raise_for_status()
self._fileJson.append(fileCont.json())
except Exception as e:
LOGGER.exception(e)
raise
return self._fileJson
@property
def files(self)->list:
'''
Returns a list of tuples with:
(Download_location, filename, mimetype, size, description,
digest, digestType )
Digest types include, but are not necessarily limited to:
'adler-32','crc-32','md2','md5','sha-1','sha-256',
'sha-384','sha-512'
'''
out = []
for page in self.fileJson:
files = page['_embedded'].get('stash:files')
if files:
for f in files:
#This broke with this commit:
# https://github.com/datadryad/dryad-app/commit/b8a333ba34b14e55cbc1d7ed5aa4451e0f41db66
#downLink = f['_links']['stash:file-download']['href']
downLink = f['_links']['stash:download']['href']
downLink = f'{constants.DRYURL}{downLink}'
name = f['path']
mimeType = f['mimeType']
size = f['size']
#HOW ABOUT PUTTING THIS IN THE DRYAD API PAGE?
descr = f.get('description', '')
digestType = f.get('digestType', '')
#not all files have a digest
digest = f.get('digest', '')
#Does it matter? If the primary use case is to
#compare why not take all the digest types.
#md5 = ''
#if digestType == 'md5' and digest:
# md5 = digest
# #nothing in the docs as to algorithms so just picking md5
# #Email from Ryan Scherle 30 Nov 20: supported digest type
# #('adler-32','crc-32','md2','md5','sha-1','sha-256',
# #'sha-384','sha-512')
out.append((downLink, name, mimeType, size, descr, digestType,
digest))
return out
@property
def oversize(self, maxsize=None):
'''
Returns a list of Dryad files whose size value
exceeds maxsize. Maximum size defaults to
dryad2dataverse.constants.MAX_UPLOAD
Parameters
----------
maxsize : int
Size in bytes in which to flag as oversize.
Defaults to constants.MAX_UPLOAD.
'''
if not maxsize:
maxsize = constants.MAX_UPLOAD
toobig = []
for f in self.files:
if f[3] >= maxsize:
toobig.append(f)
return toobig
#def_typeclass(self, typeName, multiple, typeClass):
@staticmethod
def _typeclass(typeName, multiple, typeClass):
'''
Creates wrapper around single or multiple Dataverse JSON objects.
Returns a dict *without* the Dataverse 'value' key'.
Parameters
----------
typeName : str
Dataverse typeName (eg: 'author').
multiple : boolean
"Multiple" value in Dataverse JSON.
typeClass : str
Dataverse typeClass. Usually one of 'compound', 'primitive,
'controlledVocabulary').
'''
return {'typeName':typeName, 'multiple':multiple,
'typeClass':typeClass}
@staticmethod
def _convert_generic(**kwargs):
'''
Generic dataverse json segment creator of form:
```
{dvField:
{'typeName': dvField,
'value': dryField}
```
Suitable for generalized conversions. Only provides fields with
multiple: False and typeclass:Primitive
Parameters
----------
kwargs : dict
Dict from Dataverse JSON segment
Other parameters
----------------
dvField : str
Dataverse output field
dryField : str
Dryad JSON field to convert
inJson : dict
Dryad JSON **segment** to convert
addJSON : dict (optional)
any other JSON required to complete (cf ISNI)
rType : str
'dict' (default) or 'list'.
Returns 'value' field as dict value or list.
pNotes : str
Notes to be prepended to list type values.
No trailing space required.
'''
dvField = kwargs.get('dvField')
dryField = kwargs.get('dryField')
inJson = kwargs.get('inJson')
addJson = kwargs.get('addJson')
pNotes = kwargs.get('pNotes', '')
rType = kwargs.get('rType', 'dict')
if not dvField or not dryField or not inJson:
try:
raise ValueError('Incorrect or insufficient fields provided')
except ValueError as e:
LOGGER.exception(e)
raise
outfield = inJson.get(dryField)
if outfield:
outfield = outfield.strip()
#if not outfield:
# raise ValueError(f'Dryad field {dryField} not found')
# If value missing can still concat empty dict
if not outfield:
return {}
if rType == 'list':
if pNotes:
outfield = [f'{pNotes} {outfield}']
outJson = {dvField:{'typeName':dvField,
'multiple': False,
'typeClass':'primitive',
'value': outfield}}
#Simple conversion
if not addJson:
return outJson
#Add JSONs together
addJson.update(outJson)
return addJson
@staticmethod
def _convert_author_names(author):
'''
Produces required author json fields.
This is a special case, requiring concatenation of several fields.
Parameters
----------
author : dict
dryad['author'] JSON segment.
'''
first = author.get('firstName')
last = author.get('lastName')
if first + last is None:
return None
authname = f"{author.get('lastName','')}, {author.get('firstName', '')}"
return {'authorName':
{'typeName':'authorName', 'value': authname,
'multiple':False, 'typeClass':'primitive'}}
@staticmethod
def _convert_keywords(*args):
'''
Produces the insane keyword structure Dataverse JSON segment
from a list of words.
Parameters
----------
args : list
List with elements as strings.
Generally input is Dryad JSON 'keywords', ie *Dryad['keywords'].
Don't forget to expand the list using *.
'''
outlist = []
for arg in args:
outlist.append({'keywordValue': {
'typeName':'keywordValue',
'value': arg}})
return outlist
@staticmethod
def _convert_notes(dryJson):
'''
Returns formatted notes field with Dryad JSON values that
don't really fit anywhere into the Dataverse JSON.
Parameters
----------
dryJson : dict
Dryad JSON as dict.
'''
notes = ''
#these fields should be concatenated into notes
notable = ['versionNumber',
'versionStatus',
'manuscriptNumber',
'curationStatus',
'preserveCurationStatus',
'invoiceId',
'sharingLink',
'loosenValidation',
'skipDataciteUpdate',
'storageSize',
'visibility',
'skipEmails']
for note in notable:
text = dryJson.get(note)
if text:
text = str(text).strip()
if note == 'versionNumber':
text = f'<b>Dryad version number:</b> {text}'
if note == 'versionStatus':
text = f'<b>Version status:</b> {text}'
if note == 'manuscriptNumber':
text = f'<b>Manuscript number:</b> {text}'
if note == 'curationStatus':
text = f'<b>Dryad curation status:</b> {text}'
if note == 'preserveCurationStatus':
text = f'<b>Dryad preserve curation status:</b> {text}'
if note == 'invoiceId':
text = f'<b>Invoice ID:</b> {text}'
if note == 'sharingLink':
text = f'<b>Sharing link:</b> {text}'
if note == 'loosenValidation':
text = f'<b>Loosen validation:</b> {text}'
if note == 'skipDataciteUpdate':
text = f'<b>Skip Datacite update:</b> {text}'
if note == 'storageSize':
text = f'<b>Storage size:</b> {text}'
if note == 'visibility':
text = f'<b>Visibility:</b> {text}'
if note == 'skipEmails':
text = f'<b>Skip emails:</b> {text}'
notes += f'<p>{text}</p>\n'
concat = {'typeName':'notesText',
'multiple':False,
'typeClass': 'primitive',
'value': notes}
return concat
@staticmethod
def _boundingbox(north, south, east, west):
'''
Makes a Dataverse bounding box from appropriate coordinates.
Returns Dataverse JSON segment as dict.
Parameters
----------
north : float
south : float
east : float
west : float
Notes
-----
Coordinates in decimal degrees.
'''
names = ['north', 'south', 'east', 'west']
points = [str(x) for x in [north, south, east, west]]
#Because coordinates in DV are strings BFY
coords = [(x[0]+'Longitude', {x[0]:x[1]}) for x in zip(names, points)]
#Yes, everything is longitude in Dataverse
out = []
for coord in coords:
out.append(Serializer._convert_generic(inJson=coord[1],
dvField=coord[0],
#dryField='north'))
dryField=[k for k in coord[1].keys()][0]))
return out
@staticmethod
def _convert_geospatial(dryJson):
'''
Outputs Dataverse geospatial metadata block.
Parameters
----------
dryJson : dict
Dryad json as dict.
'''
if dryJson.get('locations'):
#out = {}
coverage = []
box = []
otherCov = None
gbbox = None
for loc in dryJson.get('locations'):
if loc.get('place'):
#These are impossible to clean. Going to "other" field
other = Serializer._convert_generic(inJson=loc,
dvField='otherGeographicCoverage',
dryField='place')
coverage.append(other)
if loc.get('point'):
#makes size zero bounding box
north = loc['point']['latitude']
south = north
east = loc['point']['longitude']
west = east
point = Serializer._boundingbox(north, south, east, west)
box.append(point)
if loc.get('box'):
north = loc['box']['neLatitude']
south = loc['box']['swLatitude']
east = loc['box']['neLongitude']
west = loc['box']['swLongitude']
area = Serializer._boundingbox(north, south, east, west)
box.append(area)
if coverage:
otherCov = Serializer._typeclass(typeName='geographicCoverage',
multiple=True, typeClass='compound')
otherCov['value'] = coverage
if box:
gbbox = Serializer._typeclass(typeName='geographicCoverage',
multiple=True, typeClass='compound')
gbbox['value'] = box
if otherCov or gbbox:
gblock = {'geospatial': {'displayName' : 'Geospatial Metadata',
'fields': []}}
if otherCov:
gblock['geospatial']['fields'].append(otherCov)
if gbbox:
gblock['geospatial']['fields'].append(gbbox)
return gblock
return {}
def _assemble_json(self, dryJson=None, dvContact=None,
dvEmail=None, defContact=True):
'''
Assembles Dataverse json from Dryad JSON components.
Dataverse JSON is a nightmare, so this function is too.
Parameters
----------
dryJson : dict
Dryad json as dict.
dvContact : str
Default Dataverse contact name.
dvEmail : str
Default Dataverse 4 contact email address.
defContact : boolean
Flag to include default contact information with record.
'''
if not dvContact:
dvContact = constants.DV_CONTACT_NAME
if not dvEmail:
dvEmail = constants.DV_CONTACT_EMAIL
if not dryJson:
dryJson = self.dryadJson
LOGGER.debug(dryJson)
#Licence block changes ensure that it will only work with
#Dataverse v5.10+
#Go back to previous commits to see the earlier "standard"
self._dvJson = {'datasetVersion':
{'license':{'name': 'CC0 1.0',
'uri': 'http://creativecommons.org/publicdomain/zero/1.0' },
'termsOfUse': Serializer.CC0,
'metadataBlocks':{'citation':
{'displayName': 'Citation Metadata',
'fields': []},
}
}
}
#REQUIRED Dataverse fields
#Dryad is a general purpose database; it is hard/impossible to get
#Dataverse required subject tags out of their keywords, so:
defaultSubj = {'typeName' : 'subject',
'typeClass':'controlledVocabulary',
'multiple': True,
'value' : ['Other']}
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(defaultSubj)
reqdTitle = Serializer._convert_generic(inJson=dryJson,
dryField='title',
dvField='title')['title']
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(reqdTitle)
#authors
out = []
for a in dryJson['authors']:
reqdAuthor = Serializer._convert_author_names(a)
if reqdAuthor:
affiliation = Serializer._convert_generic(inJson=a,
dvField='authorAffiliation',
dryField='affiliation')
addOrc = {'authorIdentifierScheme':
{'typeName':'authorIdentifierScheme',
'value': 'ORCID',
'typeClass': 'controlledVocabulary',
'multiple':False}}
#only ORCID at UBC
orcid = Serializer._convert_generic(inJson=a,
dvField='authorIdentifier',
dryField='orcid',
addJson=addOrc)
if affiliation:
reqdAuthor.update(affiliation)
if orcid:
reqdAuthor.update(orcid)
out.append(reqdAuthor)
authors = Serializer._typeclass(typeName='author',
multiple=True, typeClass='compound')
authors['value'] = out
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(authors)
##rewrite as function:contact
out = []
for e in dryJson['authors']:
reqdContact = Serializer._convert_generic(inJson=e,
dvField='datasetContactEmail',
dryField='email')
if reqdContact:
author = Serializer._convert_author_names(e)
author = {'author':author['authorName']['value']}
#for passing to function
author = Serializer._convert_generic(inJson=author,
dvField='datasetContactName',
dryField='author')
if author:
reqdContact.update(author)
affiliation = Serializer._convert_generic(inJson=e,
dvField='datasetContactAffiliation',
dryField='affiliation')
if affiliation:
reqdContact.update(affiliation)
out.append(reqdContact)
if defContact:
#Adds default contact information the tail of the list
defEmail = Serializer._convert_generic(inJson={'em':dvEmail},
dvField='datasetContactEmail',
dryField='em')
defName = Serializer._convert_generic(inJson={'name':dvContact},
dvField='datasetContactName',
dryField='name')
defEmail.update(defName)
out.append(defEmail)
contacts = Serializer._typeclass(typeName='datasetContact',
multiple=True, typeClass='compound')
contacts['value'] = out
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(contacts)
#Description
description = Serializer._typeclass(typeName='dsDescription',
multiple=True, typeClass='compound')
desCat = [('abstract', '<b>Abstract</b><br/>'),
('methods', '<b>Methods</b><br />'),
('usageNotes', '<b>Usage notes</b><br />')]
out = []
for desc in desCat:
if dryJson.get(desc[0]):
descrField = Serializer._convert_generic(inJson=dryJson,
dvField='dsDescriptionValue',
dryField=desc[0])
descrField['dsDescriptionValue']['value'] = (desc[1]
+ descrField['dsDescriptionValue']['value'])
descDate = Serializer._convert_generic(inJson=dryJson,
dvField='dsDescriptionDate',
dryField='lastModificationDate')
descrField.update(descDate)
out.append(descrField)
description['value'] = out
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(description)
#Granting agencies
if dryJson.get('funders'):
out = []
for fund in dryJson['funders']:
org = Serializer._convert_generic(inJson=fund,
dvField='grantNumberAgency',
dryField='organization')
if fund.get('awardNumber'):
fund = Serializer._convert_generic(inJson=fund,
dvField='grantNumberValue',
dryField='awardNumber')
org.update(fund)
out.append(org)
grants = Serializer._typeclass(typeName='grantNumber',
multiple=True, typeClass='compound')
grants['value'] = out
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(grants)
#Keywords
keywords = Serializer._typeclass(typeName='keyword',
multiple=True, typeClass='compound')
out = []
for key in dryJson.get('keywords', []):
#Apparently keywords are not required
keydict = {'keyword':key}
#because takes a dict
kv = Serializer._convert_generic(inJson=keydict,
dvField='keywordValue',
dryField='keyword')
vocab = {'dryad':'Dryad'}
voc = Serializer._convert_generic(inJson=vocab,
dvField='keywordVocabulary',
dryField='dryad')
kv.update(voc)
out.append(kv)
keywords['value'] = out
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(keywords)
#modification date
moddate = Serializer._convert_generic(inJson=dryJson,
dvField='dateOfDeposit',
dryField='lastModificationDate')
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(moddate['dateOfDeposit'])
#This one isn't nested BFY
#distribution date
distdate = Serializer._convert_generic(inJson=dryJson,
dvField='distributionDate',
dryField='publicationDate')
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(distdate['distributionDate'])
#Also not nested
#publications
publications = Serializer._typeclass(typeName='publication',
multiple=True,
typeClass='compound')
#quick and dirty lookup table
#TODONE see https://github.com/CDL-Dryad/dryad-app/blob/
#31d17d8dab7ea3bab1256063a1e4d0cb706dd5ec/stash/stash_datacite/
#app/models/stash_datacite/related_identifier.rb
#no longer required
#lookup = {'IsDerivedFrom':'Is derived from',
# 'Cites':'Cites',
# 'IsSupplementTo': 'Is supplement to',
# 'IsSupplementedBy': 'Is supplemented by'}
out = []
if dryJson.get('relatedWorks'):
for r in dryJson.get('relatedWorks'):
#id = r.get('identifier')
#TODONE Verify that changing id to _id has not broken anything: 11Feb21
_id = r.get('identifier')
#Note:10 Feb 2021 : some records have identifier = ''. BAD DRYAD.
if not _id:
continue
relationship = r.get('relationship')
#idType = r.get('identifierType') #not required in _convert_generic
#citation = {'citation': f"{lookup[relationship]}: {id}"}
citation = {'citation': relationship.capitalize()}
pubcite = Serializer._convert_generic(inJson=citation,
dvField='publicationCitation',
dryField='citation')
pubIdType = Serializer._convert_generic(inJson=r,
dvField='publicationIDType',
dryField='identifierType')
#ID type must be lower case
pubIdType['publicationIDType']['value'] = pubIdType['publicationIDType']['value'].lower()
pubIdType['publicationIDType']['typeClass'] = 'controlledVocabulary'
pubUrl = Serializer._convert_generic(inJson=r,
dvField='publicationURL',
dryField='identifier')
#Dryad doesn't just put URLs in their URL field.
if pubUrl['publicationURL']['value'].lower().startswith('doi:'):
fixurl = 'https://doi.org/' + pubUrl['publicationURL']['value'][4:]
pubUrl['publicationURL']['value'] = fixurl
LOGGER.debug('Rewrote URLs to be %s', fixurl)
#Dryad doesn't validate URL fields to start with http or https. Assume https
if not pubUrl['publicationURL']['value'].lower().startswith('htt'):
pubUrl['publicationURL']['value'] = ('https://' +
pubUrl['publicationURL']['value'])
pubcite.update(pubIdType)
pubcite.update(pubUrl)
out.append(pubcite)
publications['value'] = out
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(publications)
#notes
#go into primary notes field, not DDI
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(Serializer._convert_notes(dryJson))
#Geospatial metadata
self._dvJson['datasetVersion']['metadataBlocks'].update(Serializer._convert_geospatial(dryJson))
#DOI --> agency/identifier
doi = Serializer._convert_generic(inJson=dryJson, dryField='identifier',
dvField='otherIdValue')
doi.update(Serializer._convert_generic(inJson={'agency':'Dryad'},
dryField='agency',
dvField='otherIdAgency'))
agency = Serializer._typeclass(typeName='otherId',
multiple=True, typeClass='compound')
agency['value'] = [doi]
self._dvJson['datasetVersion']['metadataBlocks']['citation']['fields'].append(agency)
dryadJson property writable¶
Returns Dryad study JSON. Will call Serializer.fetch_record() if no JSON is present.
dvJson property¶
Returns Dataverse study JSON as dict.
embargo property¶
Check embargo status. Returns boolean True if embargoed.
fileJson property¶
Returns a list of file JSONs from call to Dryad API /files/{id}, where the ID is parsed from the Dryad JSON. Dryad file listings are paginated, so the return consists of a list of dicts, one per page.
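A short sketch of walking the paginated result, assuming ser is a Serializer instance as above; the '_embedded' and 'stash:files' keys follow the structure the files property relies on.

for page in ser.fileJson:  # one dict per Dryad results page
    for f in page['_embedded'].get('stash:files', []):  # key may be absent on empty pages
        print(f['path'], f['mimeType'], f['size'])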
files property¶
Returns a list of tuples with:
(Download_location, filename, mimetype, size, description, digestType, digest)
Digest types include, but are not necessarily limited to:
'adler-32', 'crc-32', 'md2', 'md5', 'sha-1', 'sha-256', 'sha-384', 'sha-512'
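For illustration, a sketch that unpacks the leading fields of each tuple (assuming ser as above):

for f in ser.files:
    download_url, filename, mimetype, size = f[0], f[1], f[2], f[3]
    print(f'{filename} ({mimetype}, {size} bytes) <- {download_url}')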
id property¶
Returns Dryad unique database ID, not the DOI.
Where the original Dryad JSON is dryadJson, it’s the integer trailing portion of:
self.dryadJson['_links']['stash:version']['href']
oversize property¶
Returns a list of Dryad files whose size value exceeds maxsize. Maximum size defaults to dryad2dataverse.constants.MAX_UPLOAD
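A sketch of flagging files too large to upload (assuming ser as above); accessed as a property, oversize always compares against constants.MAX_UPLOAD.

from dryad2dataverse import constants

for f in ser.oversize:
    print(f'{f[1]}: {f[3]} bytes exceeds MAX_UPLOAD of {constants.MAX_UPLOAD}')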
__init__(doi)¶
Creates Dryad study metadata instance.
Source code in src/dryad2dataverse/serializer.py
def __init__(self, doi):
'''
Creates Dryad study metadata instance.
Parameters
----------
doi : str
DOI of Dryad study. Required for downloading.
eg: 'doi:10.5061/dryad.2rbnzs7jp'
'''
self.doi = doi
self._dryadJson = None
self._fileJson = None
self._dvJson = None
#Serializer objects will be assigned a Dataverse study PID
#if dryad2Dataverse.transfer.Transfer() is instantiated
self.dvpid = None
self.session = requests.Session()
self.session.mount('https://',
HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
LOGGER.debug('Creating Serializer instance object')
fetch_record(url=None, timeout=45)¶
Fetches Dryad study record JSON from Dryad V2 API at https://datadryad.org/api/v2/datasets/. Saves to self._dryadJson. Querying Serializer.dryadJson will call this function automatically.
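A sketch of an explicit fetch against a particular Dryad instance (the URL shown is simply the documented default):

ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
ser.fetch_record(url='https://datadryad.org', timeout=60)  # refreshes ser.dryadJson
print(ser.dryadJson.get('identifier'))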
Source code in src/dryad2dataverse/serializer.py
def fetch_record(self, url=None, timeout=45):
'''
Fetches Dryad study record JSON from Dryad V2 API at
https://datadryad.org/api/v2/datasets/.
Saves to self._dryadJson. Querying Serializer.dryadJson
will call this function automatically.
Parameters
----------
url : str
Dryad instance base URL (eg: 'https://datadryad.org').
timeout : int
Timeout in seconds. Default 45.
'''
if not url:
url = constants.DRYURL
try:
headers = {'accept':'application/json',
'Content-Type':'application/json'}
headers.update(USER_AGENT)
doiClean = urllib.parse.quote(self.doi, safe='')
resp = self.session.get(f'{url}/api/v2/datasets/{doiClean}',
headers=headers, timeout=timeout)
resp.raise_for_status()
self._dryadJson = resp.json()
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.error('URL error for: %s', url)
LOGGER.exception(err)
raise
dryad2dataverse.transfer¶
This module handles data downloads and uploads from a Dryad instance to a Dataverse instance.
Transfer¶
Transfers metadata and data files from a Dryad installation to a Dataverse installation.
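A minimal end-to-end sketch, assuming dryad2dataverse.constants is already configured (DVURL, APIKEY, TMP) and that 'somedataverse' is a hypothetical short name of the target dataverse.

from dryad2dataverse.serializer import Serializer
from dryad2dataverse.transfer import Transfer

ser = Serializer('doi:10.5061/dryad.2rbnzs7jp')
txfer = Transfer(ser)
txfer.test_api_key()                          # raises DataverseBadApiKeyError on a bad key
txfer.upload_study(targetDv='somedataverse')  # new study; pass dvpid=... to update instead
txfer.download_files()                        # stream the Dryad files into constants.TMP
txfer.upload_files()                          # push them to the new Dataverse study
txfer.upload_json()                           # archive the original Dryad JSON as a file
txfer.set_correct_date()                      # use distributionDate as the citation date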
Source code in src/dryad2dataverse/transfer.py
class Transfer():
'''
Transfers metadata and data files from a
Dryad installation to Dataverse installation.
'''
def __init__(self, dryad):
'''
Creates a dryad2dataverse.transfer.Transfer instance.
Parameters
----------
dryad : dryad2dataverse.serializer.Serializer
'''
self.dryad = dryad
self._fileJson = None
self._files = [list(f) for f in self.dryad.files]
#self._files = copy.deepcopy(self.dryad.files)
self.fileUpRecord = []
self.fileDelRecord = []
self.dvStudy = None
self.jsonFlag = None #Whether or not new json uploaded
self.session = requests.Session()
self.session.mount('https://', HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
def _del__(self): #TODONE: Change name to __del__ to make a destructor
'''Expunges files from constants.TMP on deletion'''
for f in self.files:
if os.path.exists(f'{constants.TMP}{os.sep}{f[1]}'):
os.remove(f'{constants.TMP}{os.sep}{f[1]}')
def test_api_key(self, url=None, apikey=None):
'''
Tests for an expired API key and raises
dryad2dataverse.exceptions.Dryad2dataverseBadApiKeyError
the API key is bad. Ignores other HTTP errors.
Parameters
----------
url : str
Base URL to Dataverse installation.
Defaults to dryad2dataverse.constants.DVURL
apikey : str
Default dryad2dataverse.constants.APIKEY.
'''
#API validity check appears to come before a PID validity check
params = {'persistentId': 'doi:000/000/000'} # PID is irrelevant
if not url:
url = constants.DVURL
headers = {'X-Dataverse-key': apikey if apikey else constants.APIKEY}
headers.update(USER_AGENT)
bad_test = self.session.get(f'{url}/api/datasets/:persistentId',
headers=headers,
params=params)
#There's an extra space in the message which Harvard
#will probably find out about, so . . .
if bad_test.json().get('message').startswith('Bad api key'):
try:
raise exceptions.DataverseBadApiKeyError('Bad API key')
except exceptions.DataverseBadApiKeyError as e:
LOGGER.critical('API key has expired or is otherwise invalid')
LOGGER.exception(e)
#LOGGER.exception(traceback.format_exc()) #not really necessary
raise
try: #other errors
bad_test.raise_for_status()
except requests.exceptions.HTTPError:
pass
except Exception as e:
LOGGER.exception(e)
LOGGER.exception(traceback.format_exc())
raise
@property
def dvpid(self):
'''
Returns Dataverse study persistent ID as str.
'''
return self.dryad.dvpid
@property
def auth(self):
'''
Returns datavese authentication header dict.
ie: `{X-Dataverse-key' : 'APIKEYSTRING'}`
'''
return {'X-Dataverse-key' : constants.APIKEY}
@property
def fileJson(self):
'''
Returns a list of file JSONs from call to Dryad API /files/{id},
where the ID is parsed from the Dryad JSON. Dryad file listings
are paginated.
'''
return self.dryad.fileJson.copy()
@property
def files(self):
'''
Returns a list of lists with:
[Download_location, filename, mimetype, size, description, md5digest]
This is mutable; downloading a file will add md5 info if not available.
'''
return self._files
@property
def oversize(self):
'''
Returns list of files exceeding Dataverse ingest limit
dryad2dataverse.constants.MAX_UPLOAD.
'''
return self.dryad.oversize
@property
def doi(self):
'''
Returns Dryad DOI.
'''
return self.dryad.doi
@staticmethod
def _dryad_file_id(url:str):
'''
Returns Dryad fileID from dryad file download URL as integer.
Parameters
----------
url : str
Dryad file URL in format
'https://datadryad.org/api/v2/files/385820/download'.
'''
fid = url.strip('/download')
fid = int(fid[fid.rfind('/')+1:])
return fid
@staticmethod
def _make_dv_head(apikey):
'''
Returns Dataverse authentication header as dict.
Parameters
----------
apikey : str
Dataverse API key.
'''
return {'X-Dataverse-key' : apikey}
#@staticmethod
def set_correct_date(self, url=None, hdl=None,
d_type='distributionDate',
apikey=None):
'''
Sets "correct" publication date for Dataverse.
Parameters
----------
url : str
Base URL to Dataverse installation.
Defaults to dryad2dataverse.constants.DVURL
hdl : str
Persistent indentifier for Dataverse study.
Defaults to Transfer.dvpid (which can be None if the
study has not yet been uploaded).
d_type : str
Date type. One of 'distributionDate', 'productionDate',
`dateOfDeposit'. Default 'distributionDate'.
apikey : str
Default dryad2dataverse.constants.APIKEY.
Notes
-----
dryad2dataverse.serializer maps Dryad 'publicationDate'
to Dataverse 'distributionDate' (see serializer.py ~line 675).
Dataverse citation date default is ":publicationDate". See
Dataverse API reference:
<https://guides.dataverse.org/en/4.20/api/native-api.html#id54>.
'''
try:
if not url:
url = constants.DVURL
if not hdl:
hdl = self.dvpid
headers = {'X-Dataverse-key' : apikey}
if apikey:
headers = {'X-Dataverse-key' : apikey}
else:
headers = {'X-Dataverse-key' : constants.APIKEY}
headers.update(USER_AGENT)
params = {'persistentId': hdl}
set_date = self.session.put(f'{url}/api/datasets/:persistentId/citationdate',
headers=headers,
data=d_type,
params=params,
timeout=45)
set_date.raise_for_status()
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.warning('Unable to set citation date for %s',
hdl)
LOGGER.warning(err)
LOGGER.warning(set_date.text)
def upload_study(self, url=None, apikey=None, timeout=45, **kwargs):
'''
Uploads Dryad study metadata to target Dataverse or updates existing.
Supplying a `targetDv` kwarg creates a new study and supplying a
`dvpid` kwarg updates a currently existing Dataverse study.
Parameters
----------
url : str
URL of Dataverse instance. Defaults to constants.DVURL.
apikey : str
API key of user. Defaults to contants.APIKEY.
timeout : int
timeout on POST request.
kwargs : dict
Other parameters
----------------
targetDv : str
Short name of target dataverse. Required if new dataset.
Specify as targetDV=value.
dvpid : str
Dataverse persistent ID (for updating metadata).
This is not required for new uploads, specify as dvpid=value
Notes
-----
One of targetDv or dvpid is required.
'''
if not url:
url = constants.DVURL
if not apikey:
apikey = constants.APIKEY
headers = {'X-Dataverse-key' : apikey}
headers.update(USER_AGENT)
targetDv = kwargs.get('targetDv')
dvpid = kwargs.get('dvpid')
#dryFid = kwargs.get('dryFid') #Why did I put this here?
if not targetDv and not dvpid:
try:
raise exceptions.NoTargetError('You must supply one of targetDv \
(target dataverse) \
or dvpid (Dataverse persistent ID)')
except exceptions.NoTargetError as e:
LOGGER.error('No target dataverse or dvpid supplied')
LOGGER.exception(e)
raise
if targetDv and dvpid:
try:
raise ValueError('Supply only one of targetDv or dvpid')
except ValueError as e:
LOGGER.exception(e)
raise
if not dvpid:
endpoint = f'{url}/api/dataverses/{targetDv}/datasets'
upload = self.session.post(endpoint,
headers=headers,
json=self.dryad.dvJson,
timeout=timeout)
LOGGER.debug(upload.text)
else:
endpoint = f'{url}/api/datasets/:persistentId/versions/:draft'
params = {'persistentId':dvpid}
#Yes, dataverse uses *different* json for edits
upload = self.session.put(endpoint, params=params,
headers=headers,
json=self.dryad.dvJson['datasetVersion'],
timeout=timeout)
#self._dvrecord = upload.json()
LOGGER.debug(upload.text)
try:
updata = upload.json()
self.dvStudy = updata
if updata.get('status') != 'OK':
try:
raise exceptions.DataverseUploadError(('Status return is not OK.'
f'{upload.status_code}: '
f'{upload.reason}. '
f'{upload.request.url} '
f'{upload.text}'))
except exceptions.DataverseUploadError as e:
LOGGER.exception(e)
LOGGER.exception(traceback.format_exc())
raise exceptions.DataverseUploadError(('Status return is not OK.'
f'{upload.status_code}: '
f'{upload.reason}. '
f'{upload.request.url} '
f'{upload.text}'))
upload.raise_for_status()
except Exception as e: # Only accessible via non-requests exception
LOGGER.exception(e)
LOGGER.exception(traceback.format_exc())
raise
if targetDv:
self.dryad.dvpid = updata['data'].get('persistentId')
if dvpid:
self.dryad.dvpid = updata['data'].get('datasetPersistentId')
return self.dvpid
@staticmethod
def _check_md5(infile, dig_type):
'''
Returns the hex digest of a file (formerly just md5sum).
Parameters
----------
infile : str
Complete path to target file.
dig_type : Union[str, None]
Digest type
'''
#From Ryan Scherle
#When Dryad calculates a digest, it only uses MD5.
#But if you have precomputed some other type of digest, we should accept it.
#The list of allowed values is:
#('adler-32','crc-32','md2','md5','sha-1','sha-256','sha-384','sha-512')
#hashlib doesn't support adler-32, crc-32, md2
blocksize = 2**16
#Well, this is inelegant
with open(infile, 'rb') as m:
#fmd5 = hashlib.md5()
## var name kept for posterity. Maybe refactor
if dig_type in ['sha-1', 'sha-256', 'sha-384', 'sha-512', 'md5', 'md2']:
if dig_type == 'md2':
fmd5 = Crypto.Hash.MD2.new()
else:
fmd5 = HASHTABLE[dig_type]()
fblock = m.read(blocksize)
while fblock:
fmd5.update(fblock)
fblock = m.read(blocksize)
return fmd5.hexdigest()
if dig_type in ['adler-32', 'crc-32']:
fblock = m.read(blocksize)
curvalue = HASHTABLE[dig_type](fblock)
while fblock:
fblock = m.read(blocksize)
curvalue = HASHTABLE[dig_type](fblock, curvalue)
return curvalue
raise exceptions.HashError(f'Unable to determine hash type for{infile}: {dig_type}')
def download_file(self, url=None, filename=None, tmp=None,
size=None, chk=None, timeout=45, **kwargs):
'''
Downloads a file via requests streaming and saves to constants.TMP.
returns checksum on success and an exception on failure.
Parameters
----------
url : str
URL of download.
filename : str
Output file name.
timeout : int
Requests timeout.
tmp : str
Temporary directory for downloads.
Defaults to dryad2dataverse.constants.TMP.
size : int
Reported file size in bytes.
Defaults to dryad2dataverse.constants.MAX_UPLOAD.
chk : str
checksum of file (if available and known).
timeout : int
timeout in seconds
kwargs : dict
Other parameters
----------------
digest_type : str
checksum type (ie, md5, sha-256, etc)
'''
LOGGER.debug('Start download sequence')
LOGGER.debug('MAX SIZE = %s', constants.MAX_UPLOAD)
LOGGER.debug('Filename: %s, size=%s', filename, size)
if not tmp:
tmp = constants.TMP
if tmp.endswith(os.sep):
tmp = tmp[:-1]
if size:
if size > constants.MAX_UPLOAD:
#TOO BIG
LOGGER.warning('%s: File %s exceeds '
'Dataverse MAX_UPLOAD size. Skipping download.',
self.doi, filename)
md5 = 'this_file_is_too_big_to_upload__' #HA HA
for i in self._files:
if url == i[0]:
i[-1] = md5
LOGGER.debug('Stop download sequence with large file skip')
return md5
try:
down = self.session.get(url, timeout=timeout, stream=True)
down.raise_for_status()
with open(f'{tmp}{os.sep}{filename}', 'wb') as fi:
for chunk in down.iter_content(chunk_size=8192):
fi.write(chunk)
#verify size
#https://stackoverflow.com/questions/2104080/how-can-i-check-file-size-in-python'
if size:
checkSize = os.stat(f'{tmp}{os.sep}{filename}').st_size
if checkSize != size:
try:
raise exceptions.DownloadSizeError('Download size does not match '
'reported size')
except exceptions.DownloadSizeError as e:
LOGGER.exception(e)
raise
#now check the md5
md5 = None
if chk and kwargs.get('digest_type') in HASHTABLE:
md5 = Transfer._check_md5(f'{tmp}{os.sep}{filename}',
kwargs['digest_type'])
if md5 != chk:
try:
raise exceptions.HashError(f'Hex digest mismatch: {md5} : {chk}')
#is this really what I want to do on a bad checksum?
except exceptions.HashError as e:
LOGGER.exception(e)
raise
for i in self._files:
if url == i[0]:
i[-1] = md5
LOGGER.debug('Complete download sequence')
#This doesn't actually return an md5, just the hash value
return md5
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.critical('Unable to download %s', url)
LOGGER.exception(err)
raise exceptions.DataverseDownloadError
def download_files(self, files=None):
'''
Bulk downloader for files.
Parameters
----------
files : list
Items in list can be tuples or list with a minimum of:
`(dryaddownloadurl, filenamewithoutpath, [md5sum])`
The md5 sum should be the last member of the tuple.
Defaults to self.files.
Notes
-----
Normally used without arguments to download all the associated
files with a Dryad study.
'''
if not files:
files = self.files
try:
for f in files:
self.download_file(url=f[0],
filename=f[1],
mimetype=f[2],
size=f[3],
descr=f[4],
digest_type=f[5],
chk=f[-1])
except exceptions.DataverseDownloadError as e:
LOGGER.exception('Unable to download file with info %s\n%s', f, e)
raise
def file_lock_check(self, study, dv_url, apikey=None, count=0):
'''
Checks for a study lock
Returns True if locked. Normally used to check
if processing is completed. As tabular processing
halts file ingest, there should be no locks on a
Dataverse study before performing a data file upload.
Parameters
----------
study : str
Persistent indentifer of study.
dv_url : str
URL to base Dataverse installation.
apikey : str
API key for user.
If not present authorization defaults to self.auth.
count : int
Number of times the function has been called. Logs
lock messages only on 0.
'''
if dv_url.endswith('/'):
dv_url = dv_url[:-1]
if apikey:
headers = {'X-Dataverse-key': apikey}
else:
headers = self.auth
headers.update(USER_AGENT)
params = {'persistentId': study}
try:
lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
headers=headers,
params=params, timeout=300)
lock_status.raise_for_status()
if lock_status.json().get('data'):
if count == 0:
LOGGER.warning('Study %s has been locked', study)
LOGGER.warning('Lock info:\n%s', lock_status.json())
return True
return False
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.error('Unable to detect lock status for %s', study)
LOGGER.error('ERROR message: %s', lock_status.text)
LOGGER.exception(err)
#return True #Should I raise here?
raise
def force_notab_unlock(self, study, dv_url, apikey=None):
'''
Checks for a study lock and forcibly unlocks and uningests
to prevent tabular file processing. Required if mime and filename
spoofing is not sufficient.
**Forcible unlocks require a superuser API key.**
Parameters
----------
study : str
Persistent indentifer of study.
dv_url : str
URL to base Dataverse installation.
apikey : str
API key for user.
If not present authorization defaults to self.auth.
'''
if dv_url.endswith('/'):
dv_url = dv_url[:-1]
if apikey:
headers = {'X-Dataverse-key': apikey}
else:
headers = self.auth
headers.update(USER_AGENT)
params = {'persistentId': study}
lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
headers=headers,
params=params, timeout=300)
lock_status.raise_for_status()
if lock_status.json()['data']:
LOGGER.warning('Study %s has been locked', study)
LOGGER.warning('Lock info:\n%s', lock_status.json())
force_unlock = self.session.delete(f'{dv_url}/api/datasets/:persistentId/locks',
params=params, headers=headers,
timeout=300)
force_unlock.raise_for_status()
LOGGER.warning('Lock removed for %s', study)
LOGGER.warning('Lock status:\n %s', force_unlock.json())
#This is what the file ID was for, in case it can
#be implemented again.
#According to Harvard, you can't remove the progress bar
#for uploaded tab files that squeak through unless you
#let them ingest first then reingest them. Oh well.
#See:
#https://groups.google.com/d/msgid/dataverse-community/
#74caa708-e39b-4259-874d-5b6b74ef9723n%40googlegroups.com
#Also, you can't uningest it because it hasn't been
#ingested once it's been unlocked. So the commented
#code below is useless (for now)
#uningest = requests.post(f'{dv_url}/api/files/{fid}/uningest',
# headers=headers,
# timeout=300)
#LOGGER.warning('Ingest halted for file %s for study %s', fid, study)
#uningest.raise_for_status()
def upload_file(self, dryadUrl=None, filename=None,
mimetype=None, size=None, descr=None,
hashtype=None,
#md5=None, studyId=None, dest=None,
digest=None, studyId=None, dest=None,
fprefix=None, force_unlock=False, timeout=300):
'''
Uploads file to Dataverse study. Returns a tuple of the
dryadFid (or None) and Dataverse JSON from the POST request.
Failures produce JSON with different status messages
rather than raising an exception.
Parameters
----------
filename : str
Filename (not including path).
mimetype : str
Mimetype of file.
size : int
Size in bytes.
studyId : str
Persistent Dataverse study identifier.
Defaults to Transfer.dvpid.
dest : str
Destination dataverse installation url.
Defaults to constants.DVURL.
hashtype: str
original Dryad hash type
fprefix : str
Path to file, not including a trailing slash.
timeout : int
Timeout in seconds for POST request. Default 300.
dryadUrl : str
Dryad download URL if you want to include a Dryad file id.
force_unlock : bool
Attempt forcible unlock instead of waiting for tabular
file processing.
Defaults to False.
The Dataverse `/locks` endpoint blocks POST and DELETE requests
from non-superusers (undocumented as of 31 March 2021).
**Forcible unlock requires a superuser API key.**
'''
#return locals()
#TODONE remove above
if not studyId:
studyId = self.dvpid
if not dest:
dest = constants.DVURL
if not fprefix:
fprefix = constants.TMP
if dryadUrl:
fid = dryadUrl.strip('/download')
fid = int(fid[fid.rfind('/')+1:])
else:
fid = 0 #dummy fid for non-Dryad use
params = {'persistentId' : studyId}
upfile = fprefix + os.sep + filename[:]
badExt = filename[filename.rfind('.'):].lower()
#Descriptions are technically possible, although how to add
#them is buried in Dryad's API documentation
dv4meta = {'label' : filename[:], 'description' : descr}
#if mimetype == 'application/zip' or filename.lower().endswith('.zip'):
if mimetype == 'application/zip' or badExt in constants.NOTAB:
mimetype = 'application/octet-stream' # stop unzipping automatically
filename += '.NOPROCESS' # Also screw with their naming convention
#debug log about file names to see what is up with XSLX
#see doi:10.5061/dryad.z8w9ghxb6
LOGGER.debug('File renamed to %s for upload', filename)
if size >= constants.MAX_UPLOAD:
fail = (fid, {'status' : 'Failure: MAX_UPLOAD size exceeded'})
self.fileUpRecord.append(fail)
LOGGER.warning('%s: File %s of '
'size %s exceeds '
'Dataverse MAX_UPLOAD size. Skipping.', self.doi, filename, size)
return fail
fields = {'file': (filename, open(upfile, 'rb'), mimetype)}
fields.update({'jsonData': f'{dv4meta}'})
multi = MultipartEncoder(fields=fields)
ctype = {'Content-type' : multi.content_type}
tmphead = self.auth.copy()
tmphead.update(ctype)
tmphead.update(USER_AGENT)
url = dest + '/api/datasets/:persistentId/add'
try:
upload = self.session.post(url, params=params,
headers=tmphead,
data=multi, timeout=timeout)
#print(upload.text)
upload.raise_for_status()
self.fileUpRecord.append((fid, upload.json()))
upmd5 = upload.json()['data']['files'][0]['dataFile']['checksum']['value']
#Dataverse hash type
_type = upload.json()['data']['files'][0]['dataFile']['checksum']['type']
if _type.lower() != hashtype.lower():
comparator = self._check_md5(upfile, _type.lower())
else:
comparator = digest
#if hashtype.lower () != 'md5':
# #get an md5 because dataverse uses md5s. Or most of them do anyway.
# #One day this will be rewritten properly.
# md5 = self._check_md5(filename, 'md5')
#else:
# md5 = digest
#if md5 and (upmd5 != md5):
if upmd5 != comparator:
try:
raise exceptions.HashError(f'{_type} mismatch:\nlocal: {comparator}\nuploaded: {upmd5}')
except exceptions.HashError as e:
LOGGER.exception(e)
raise
#Make damn sure that the study isn't locked because of
#tab file processing
##SPSS files still process despite spoofing MIME and extension
##so there's also a forcible unlock check
#fid = upload.json()['data']['files'][0]['dataFile']['id']
#fid not required for unlock
#self.force_notab_unlock(studyId, dest, fid)
if force_unlock:
self.force_notab_unlock(studyId, dest)
else:
count = 0
wait = True
while wait:
wait = self.file_lock_check(studyId, dest, count=count)
if wait:
time.sleep(15) # Don't hit it too often
count += 1
return (fid, upload.json())
except Exception as e:
LOGGER.exception(e)
try:
reason = upload.json()['message']
LOGGER.warning(upload.json())
return (fid, {'status' : f'Failure: {reason}'})
except Exception as e:
LOGGER.warning('Further exceptions!')
LOGGER.exception(e)
LOGGER.warning(upload.text)
return (fid, {'status' : f'Failure: Reason {upload.reason}'})
def upload_files(self, files=None, pid=None, fprefix=None, force_unlock=False):
'''
Uploads multiple files to study with persistentId pid.
Returns a list of the original tuples plus JSON responses.
Parameters
----------
files : list
List contains tuples with
(dryadDownloadURL, filename, mimetype, size).
pid : str
Defaults to self.dvpid, which is generated by calling
dryad2dataverse.transfer.Transfer.upload_study().
fprefix : str
File location prefix.
Defaults to dryad2dataverse.constants.TMP
force_unlock : bool
Attempt forcible unlock instead of waiting for tabular
file processing.
Defaults to False.
The Dataverse `/locks` endpoint blocks POST and DELETE requests
from non-superusers (undocumented as of 31 March 2021).
**Forcible unlock requires a superuser API key.**
'''
if not files:
files = self.files
if not fprefix:
fprefix = constants.TMP
out = []
for f in files:
#out.append(self.upload_file(f[0], f[1], f[2], f[3],
# f[4], f[5], pid, fprefix=fprefix))
#out.append(self.upload_file(*[x for x in f],
#last item in files is not necessary
out.append(self.upload_file(*list(f)[:-1],
studyId=pid, fprefix=fprefix,
force_unlock=force_unlock))
return out
def upload_json(self, studyId=None, dest=None):
'''
Uploads Dryad json as a separate file for archival purposes.
Parameters
----------
studyId : str
Dataverse persistent identifier.
Default dryad2dataverse.transfer.Transfer.dvpid,
which is only generated on
dryad2dataverse.transfer.Transfer.upload_study()
dest : str
Base URL for transfer.
Default dryad2datavese.constants.DVURL
'''
if not studyId:
studyId = self.dvpid
if not dest:
dest = constants.DVURL
if not self.jsonFlag:
url = dest + '/api/datasets/:persistentId/add'
pack = io.StringIO(json.dumps(self.dryad.dryadJson))
desc = {'description':'Original JSON from Dryad',
'categories':['Documentation', 'Code']}
fname = self.doi[self.doi.rfind('/')+1:].replace('.', '_')
payload = {'file': (f'{fname}.json', pack, 'text/plain;charset=UTF-8'),
'jsonData':f'{desc}'}
params = {'persistentId':studyId}
try:
meta = self.session.post(f'{url}',
params=params,
headers=self.auth,
files=payload)
#0 because no dryad fid will be zero
meta.raise_for_status()
self.fileUpRecord.append((0, meta.json()))
self.jsonFlag = (0, meta.json())
LOGGER.debug('Successfully uploaded Dryad JSON to %s', studyId)
#JSON uploads randomly fail with a Dataverse server.log error of
#"A system exception occurred during an invocation on EJB . . ."
#Not reproducible, so errors will only be written to the log.
#Jesus.
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.error('Unable to upload Dryad JSON to %s', studyId)
LOGGER.error('ERROR message: %s', meta.text)
LOGGER.exception(err)
#And further checking as to what is happening
self.fileUpRecord.append((0, {'status':'Failure: Unable to upload Dryad JSON'}))
if not isinstance(self.dryad.dryadJson, dict):
LOGGER.error('Dryad JSON is not a dictionary')
except Exception as err:
LOGGER.error('Unable to upload Dryad JSON')
LOGGER.exception(err)
def delete_dv_file(self, dvfid, dvurl=None, key=None)->bool:
#WTAF curl -u $API_TOKEN: -X DELETE
#https://$HOSTNAME/dvn/api/data-deposit/v1.1/swordv2/edit-media/file/123
'''
Deletes files from Dataverse target given a dataverse file ID.
This information is unknowable unless discovered by
dryad2dataverse.monitor.Monitor or by other methods.
Returns 1 on success (204 response), or 0 on other response.
Parameters
----------
dvurl : str
Base URL of dataverse instance.
Defaults to dryad2dataverse.constants.DVURL.
dvfid : str
Dataverse file ID number.
key : str
API key
'''
if not dvurl:
dvurl = constants.DVURL
if not key:
key = constants.APIKEY
delme = self.session.delete(f'{dvurl}/dvn/api/data-deposit/v1.1/swordv2/edit-media'
f'/file/{dvfid}',
auth=(key, ''))
if delme.status_code == 204:
self.fileDelRecord.append(dvfid)
return 1
return 0
def delete_dv_files(self, dvfids=None, dvurl=None, key=None):
'''
Deletes all files in list of Dataverse file ids from
a Dataverse installation.
Parameters
----------
dvfids : list
List of Dataverse file ids.
Defaults to dryad2dataverse.transfer.Transfer.fileDelRecord.
dvurl : str
Base URL of Dataverse. Defaults to dryad2dataverse.constants.DVURL.
key : str
API key for Dataverse. Defaults to dryad2dataverse.constants.APIKEY.
'''
#if not dvfids:
# dvfids = self.fileDelRecord
if not dvurl:
dvurl = constants.DVURL
if not key:
key = constants.APIKEY
for fid in dvfids:
self.delete_dv_file(fid, dvurl, key)
auth property¶
Returns the Dataverse authentication header dict, i.e. {'X-Dataverse-key': 'APIKEYSTRING'}.
doi property¶
Returns Dryad DOI.
dvpid property¶
Returns Dataverse study persistent ID as str.
fileJson property¶
Returns a list of file JSONs from call to Dryad API /files/{id}, where the ID is parsed from the Dryad JSON. Dryad file listings are paginated.
files property¶
Returns a list of lists with:
[Download_location, filename, mimetype, size, description, md5digest]
This is mutable; downloading a file will add md5 info if not available.
oversize property¶
Returns list of files exceeding Dataverse ingest limit dryad2dataverse.constants.MAX_UPLOAD.
__init__(dryad)¶
Creates a dryad2dataverse.transfer.Transfer instance.
Source code in src/dryad2dataverse/transfer.py
def __init__(self, dryad):
'''
Creates a dryad2dataverse.transfer.Transfer instance.
Parameters
----------
dryad : dryad2dataverse.serializer.Serializer
'''
self.dryad = dryad
self._fileJson = None
self._files = [list(f) for f in self.dryad.files]
#self._files = copy.deepcopy(self.dryad.files)
self.fileUpRecord = []
self.fileDelRecord = []
self.dvStudy = None
self.jsonFlag = None #Whether or not new json uploaded
self.session = requests.Session()
self.session.mount('https://', HTTPAdapter(max_retries=constants.RETRY_STRATEGY))
delete_dv_file(dvfid, dvurl=None, key=None)¶
Deletes files from Dataverse target given a dataverse file ID. This information is unknowable unless discovered by dryad2dataverse.monitor.Monitor or by other methods.
Returns 1 on success (204 response), or 0 on other response.
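A hedged sketch, assuming txfer is a Transfer instance and 12345 is a placeholder file id discovered elsewhere (for example via dryad2dataverse.monitor.Monitor):

from dryad2dataverse import constants

ok = txfer.delete_dv_file(12345, dvurl=constants.DVURL, key=constants.APIKEY)
if ok:
    print('Deleted; the file id has been appended to txfer.fileDelRecord')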
Source code in src/dryad2dataverse/transfer.py
def delete_dv_file(self, dvfid, dvurl=None, key=None)->bool:
#WTAF curl -u $API_TOKEN: -X DELETE
#https://$HOSTNAME/dvn/api/data-deposit/v1.1/swordv2/edit-media/file/123
'''
Deletes files from Dataverse target given a dataverse file ID.
This information is unknowable unless discovered by
dryad2dataverse.monitor.Monitor or by other methods.
Returns 1 on success (204 response), or 0 on other response.
Parameters
----------
dvurl : str
Base URL of dataverse instance.
Defaults to dryad2dataverse.constants.DVURL.
dvfid : str
Dataverse file ID number.
key : str
API key
'''
if not dvurl:
dvurl = constants.DVURL
if not key:
key = constants.APIKEY
delme = self.session.delete(f'{dvurl}/dvn/api/data-deposit/v1.1/swordv2/edit-media'
f'/file/{dvfid}',
auth=(key, ''))
if delme.status_code == 204:
self.fileDelRecord.append(dvfid)
return 1
return 0
delete_dv_files(dvfids=None, dvurl=None, key=None)¶
Deletes all files in list of Dataverse file ids from a Dataverse installation.
Source code in src/dryad2dataverse/transfer.py
def delete_dv_files(self, dvfids=None, dvurl=None, key=None):
'''
Deletes all files in list of Dataverse file ids from
a Dataverse installation.
Parameters
----------
dvfids : list
List of Dataverse file ids.
Defaults to dryad2dataverse.transfer.Transfer.fileDelRecord.
dvurl : str
Base URL of Dataverse. Defaults to dryad2dataverse.constants.DVURL.
key : str
API key for Dataverse. Defaults to dryad2dataverse.constants.APIKEY.
'''
#if not dvfids:
# dvfids = self.fileDelRecord
if not dvurl:
dvurl = constants.DVURL
if not key:
key = constants.APIKEY
for fid in dvfids:
self.delete_dv_file(fid, dvurl, key)
download_file(url=None, filename=None, tmp=None, size=None, chk=None, timeout=45, **kwargs)¶
Downloads a file via requests streaming and saves it to constants.TMP. Returns the checksum on success and raises an exception on failure.
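A sketch of downloading a single entry from txfer.files (assuming txfer as above); the index choices mirror the call made by download_files.

f = txfer.files[0]  # [url, filename, mimetype, size, description, digestType, digest]
digest = txfer.download_file(url=f[0], filename=f[1], size=f[3],
                             chk=f[-1], digest_type=f[5], timeout=60)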
Source code in src/dryad2dataverse/transfer.py
def download_file(self, url=None, filename=None, tmp=None,
size=None, chk=None, timeout=45, **kwargs):
'''
Downloads a file via requests streaming and saves to constants.TMP.
returns checksum on success and an exception on failure.
Parameters
----------
url : str
URL of download.
filename : str
Output file name.
timeout : int
Requests timeout.
tmp : str
Temporary directory for downloads.
Defaults to dryad2dataverse.constants.TMP.
size : int
Reported file size in bytes.
Defaults to dryad2dataverse.constants.MAX_UPLOAD.
chk : str
checksum of file (if available and known).
timeout : int
timeout in seconds
kwargs : dict
Other parameters
----------------
digest_type : str
checksum type (ie, md5, sha-256, etc)
'''
LOGGER.debug('Start download sequence')
LOGGER.debug('MAX SIZE = %s', constants.MAX_UPLOAD)
LOGGER.debug('Filename: %s, size=%s', filename, size)
if not tmp:
tmp = constants.TMP
if tmp.endswith(os.sep):
tmp = tmp[:-1]
if size:
if size > constants.MAX_UPLOAD:
#TOO BIG
LOGGER.warning('%s: File %s exceeds '
'Dataverse MAX_UPLOAD size. Skipping download.',
self.doi, filename)
md5 = 'this_file_is_too_big_to_upload__' #HA HA
for i in self._files:
if url == i[0]:
i[-1] = md5
LOGGER.debug('Stop download sequence with large file skip')
return md5
try:
down = self.session.get(url, timeout=timeout, stream=True)
down.raise_for_status()
with open(f'{tmp}{os.sep}{filename}', 'wb') as fi:
for chunk in down.iter_content(chunk_size=8192):
fi.write(chunk)
#verify size
#https://stackoverflow.com/questions/2104080/how-can-i-check-file-size-in-python'
if size:
checkSize = os.stat(f'{tmp}{os.sep}{filename}').st_size
if checkSize != size:
try:
raise exceptions.DownloadSizeError('Download size does not match '
'reported size')
except exceptions.DownloadSizeError as e:
LOGGER.exception(e)
raise
#now check the md5
md5 = None
if chk and kwargs.get('digest_type') in HASHTABLE:
md5 = Transfer._check_md5(f'{tmp}{os.sep}{filename}',
kwargs['digest_type'])
if md5 != chk:
try:
raise exceptions.HashError(f'Hex digest mismatch: {md5} : {chk}')
#is this really what I want to do on a bad checksum?
except exceptions.HashError as e:
LOGGER.exception(e)
raise
for i in self._files:
if url == i[0]:
i[-1] = md5
LOGGER.debug('Complete download sequence')
#This doesn't actually return an md5, just the hash value
return md5
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.critical('Unable to download %s', url)
LOGGER.exception(err)
raise exceptions.DataverseDownloadError
download_files(files=None)¶
Bulk downloader for files.
Notes
Normally used without arguments to download all the associated files with a Dryad study.
Source code in src/dryad2dataverse/transfer.py
def download_files(self, files=None):
'''
Bulk downloader for files.
Parameters
----------
files : list
Items in list can be tuples or list with a minimum of:
`(dryaddownloadurl, filenamewithoutpath, [md5sum])`
The md5 sum should be the last member of the tuple.
Defaults to self.files.
Notes
-----
Normally used without arguments to download all the associated
files with a Dryad study.
'''
if not files:
files = self.files
try:
for f in files:
self.download_file(url=f[0],
filename=f[1],
mimetype=f[2],
size=f[3],
descr=f[4],
digest_type=f[5],
chk=f[-1])
except exceptions.DataverseDownloadError as e:
LOGGER.exception('Unable to download file with info %s\n%s', f, e)
raise
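In a typical run it is called with no arguments and iterates over the file tuples assembled by the serializer (a sketch, continuing with the transfer instance from above):
transfer.download_files()   # iterates over transfer.files; each tuple ends with the checksum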
file_lock_check(study, dv_url, apikey=None, count=0)
¶
Checks for a study lock.
Returns True if locked. Normally used to check whether processing has completed. Because tabular processing halts file ingest, there should be no locks on a Dataverse study before performing a data file upload.
Source code in src/dryad2dataverse/transfer.py
def file_lock_check(self, study, dv_url, apikey=None, count=0):
'''
Checks for a study lock
Returns True if locked. Normally used to check
if processing is completed. As tabular processing
halts file ingest, there should be no locks on a
Dataverse study before performing a data file upload.
Parameters
----------
study : str
Persistent identifier of study.
dv_url : str
URL to base Dataverse installation.
apikey : str
API key for user.
If not present authorization defaults to self.auth.
count : int
Number of times the function has been called. Logs
lock messages only on 0.
'''
if dv_url.endswith('/'):
dv_url = dv_url[:-1]
if apikey:
headers = {'X-Dataverse-key': apikey}
else:
headers = self.auth
headers.update(USER_AGENT)
params = {'persistentId': study}
try:
lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
headers=headers,
params=params, timeout=300)
lock_status.raise_for_status()
if lock_status.json().get('data'):
if count == 0:
LOGGER.warning('Study %s has been locked', study)
LOGGER.warning('Lock info:\n%s', lock_status.json())
return True
return False
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.error('Unable to detect lock status for %s', study)
LOGGER.error('ERROR message: %s', lock_status.text)
LOGGER.exception(err)
#return True #Should I raise here?
raise
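A polling sketch mirroring what upload_file() does internally when force_unlock is False; the study PID and installation URL are placeholders:
import time
count = 0
while transfer.file_lock_check('doi:10.5072/FK2/ABCDEF',
                               'https://dataverse.example.edu',
                               count=count):
    time.sleep(15)   # wait for tabular ingest locks to clear
    count += 1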
force_notab_unlock(study, dv_url, apikey=None)
¶
Checks for a study lock, then forcibly unlocks the study and uningests files to prevent tabular file processing. Required if MIME type and filename spoofing are not sufficient.
Forcible unlocks require a superuser API key.
Source code in src/dryad2dataverse/transfer.py
def force_notab_unlock(self, study, dv_url, apikey=None):
'''
Checks for a study lock and forcibly unlocks and uningests
to prevent tabular file processing. Required if mime and filename
spoofing is not sufficient.
**Forcible unlocks require a superuser API key.**
Parameters
----------
study : str
Persistent identifier of study.
dv_url : str
URL to base Dataverse installation.
apikey : str
API key for user.
If not present authorization defaults to self.auth.
'''
if dv_url.endswith('/'):
dv_url = dv_url[:-1]
if apikey:
headers = {'X-Dataverse-key': apikey}
else:
headers = self.auth
headers.update(USER_AGENT)
params = {'persistentId': study}
lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
headers=headers,
params=params, timeout=300)
lock_status.raise_for_status()
if lock_status.json()['data']:
LOGGER.warning('Study %s has been locked', study)
LOGGER.warning('Lock info:\n%s', lock_status.json())
force_unlock = self.session.delete(f'{dv_url}/api/datasets/:persistentId/locks',
params=params, headers=headers,
timeout=300)
force_unlock.raise_for_status()
LOGGER.warning('Lock removed for %s', study)
LOGGER.warning('Lock status:\n %s', force_unlock.json())
set_correct_date(url=None, hdl=None, d_type='distributionDate', apikey=None)
¶
Sets “correct” publication date for Dataverse.
Notes
dryad2dataverse.serializer maps Dryad ‘publicationDate’ to Dataverse ‘distributionDate’ (see serializer.py ~line 675).
Dataverse citation date default is “:publicationDate”. See Dataverse API reference: https://guides.dataverse.org/en/4.20/api/native-api.html#id54.
Source code in src/dryad2dataverse/transfer.py
def set_correct_date(self, url=None, hdl=None,
d_type='distributionDate',
apikey=None):
'''
Sets "correct" publication date for Dataverse.
Parameters
----------
url : str
Base URL to Dataverse installation.
Defaults to dryad2dataverse.constants.DVURL
hdl : str
Persistent indentifier for Dataverse study.
Defaults to Transfer.dvpid (which can be None if the
study has not yet been uploaded).
d_type : str
Date type. One of 'distributionDate', 'productionDate',
'dateOfDeposit'. Default 'distributionDate'.
apikey : str
Default dryad2dataverse.constants.APIKEY.
Notes
-----
dryad2dataverse.serializer maps Dryad 'publicationDate'
to Dataverse 'distributionDate' (see serializer.py ~line 675).
Dataverse citation date default is ":publicationDate". See
Dataverse API reference:
<https://guides.dataverse.org/en/4.20/api/native-api.html#id54>.
'''
try:
if not url:
url = constants.DVURL
if not hdl:
hdl = self.dvpid
if apikey:
headers = {'X-Dataverse-key' : apikey}
else:
headers = {'X-Dataverse-key' : constants.APIKEY}
headers.update(USER_AGENT)
params = {'persistentId': hdl}
set_date = self.session.put(f'{url}/api/datasets/:persistentId/citationdate',
headers=headers,
data=d_type,
params=params,
timeout=45)
set_date.raise_for_status()
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.warning('Unable to set citation date for %s',
hdl)
LOGGER.warning(err)
LOGGER.warning(set_date.text)
test_api_key(url=None, apikey=None)
¶
Tests for an expired API key and raises dryad2dataverse.exceptions.DataverseBadApiKeyError if the API key is bad. Other HTTP errors are ignored.
Source code in src/dryad2dataverse/transfer.py
def test_api_key(self, url=None, apikey=None):
'''
Tests for an expired API key and raises
dryad2dataverse.exceptions.DataverseBadApiKeyError if
the API key is bad. Ignores other HTTP errors.
Parameters
----------
url : str
Base URL to Dataverse installation.
Defaults to dryad2dataverse.constants.DVURL
apikey : str
Default dryad2dataverse.constants.APIKEY.
'''
#API validity check appears to come before a PID validity check
params = {'persistentId': 'doi:000/000/000'} # PID is irrelevant
if not url:
url = constants.DVURL
headers = {'X-Dataverse-key': apikey if apikey else constants.APIKEY}
headers.update(USER_AGENT)
bad_test = self.session.get(f'{url}/api/datasets/:persistentId',
headers=headers,
params=params)
#There's an extra space in the message which Harvard
#will probably find out about, so . . .
if bad_test.json().get('message').startswith('Bad api key'):
try:
raise exceptions.DataverseBadApiKeyError('Bad API key')
except exceptions.DataverseBadApiKeyError as e:
LOGGER.critical('API key has expired or is otherwise invalid')
LOGGER.exception(e)
#LOGGER.exception(traceback.format_exc()) #not really necessary
raise
try: #other errors
bad_test.raise_for_status()
except requests.exceptions.HTTPError:
pass
except Exception as e:
LOGGER.exception(e)
LOGGER.exception(traceback.format_exc())
raise
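A sketch for failing fast before a long transfer run (transfer is the assumed Transfer instance from the earlier examples):
from dryad2dataverse import exceptions
try:
    transfer.test_api_key()   # defaults to constants.DVURL and constants.APIKEY
except exceptions.DataverseBadApiKeyError:
    ...   # obtain a new key or abort the pipeline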
upload_file(dryadUrl=None, filename=None, mimetype=None, size=None, descr=None, hashtype=None, digest=None, studyId=None, dest=None, fprefix=None, force_unlock=False, timeout=300)
¶
Uploads file to Dataverse study. Returns a tuple of the dryadFid (or None) and Dataverse JSON from the POST request. Failures produce JSON with different status messages rather than raising an exception.
Source code in src/dryad2dataverse/transfer.py
def upload_file(self, dryadUrl=None, filename=None,
mimetype=None, size=None, descr=None,
hashtype=None,
#md5=None, studyId=None, dest=None,
digest=None, studyId=None, dest=None,
fprefix=None, force_unlock=False, timeout=300):
'''
Uploads file to Dataverse study. Returns a tuple of the
dryadFid (or None) and Dataverse JSON from the POST request.
Failures produce JSON with different status messages
rather than raising an exception.
Parameters
----------
filename : str
Filename (not including path).
mimetype : str
Mimetype of file.
size : int
Size in bytes.
studyId : str
Persistent Dataverse study identifier.
Defaults to Transfer.dvpid.
dest : str
Destination dataverse installation url.
Defaults to constants.DVURL.
hashtype: str
original Dryad hash type
fprefix : str
Path to file, not including a trailing slash.
timeout : int
Timeout in seconds for POST request. Default 300.
dryadUrl : str
Dryad download URL if you want to include a Dryad file id.
force_unlock : bool
Attempt forcible unlock instead of waiting for tabular
file processing.
Defaults to False.
The Dataverse `/locks` endpoint blocks POST and DELETE requests
from non-superusers (undocumented as of 31 March 2021).
**Forcible unlock requires a superuser API key.**
'''
#return locals()
#TODONE remove above
if not studyId:
studyId = self.dvpid
if not dest:
dest = constants.DVURL
if not fprefix:
fprefix = constants.TMP
if dryadUrl:
fid = dryadUrl.strip('/download')
fid = int(fid[fid.rfind('/')+1:])
else:
fid = 0 #dummy fid for non-Dryad use
params = {'persistentId' : studyId}
upfile = fprefix + os.sep + filename[:]
badExt = filename[filename.rfind('.'):].lower()
#Descriptions are technically possible, although how to add
#them is buried in Dryad's API documentation
dv4meta = {'label' : filename[:], 'description' : descr}
#if mimetype == 'application/zip' or filename.lower().endswith('.zip'):
if mimetype == 'application/zip' or badExt in constants.NOTAB:
mimetype = 'application/octet-stream' # stop unzipping automatically
filename += '.NOPROCESS' # Also screw with their naming convention
#debug log about file names to see what is up with XSLX
#see doi:10.5061/dryad.z8w9ghxb6
LOGGER.debug('File renamed to %s for upload', filename)
if size >= constants.MAX_UPLOAD:
fail = (fid, {'status' : 'Failure: MAX_UPLOAD size exceeded'})
self.fileUpRecord.append(fail)
LOGGER.warning('%s: File %s of '
'size %s exceeds '
'Dataverse MAX_UPLOAD size. Skipping.', self.doi, filename, size)
return fail
fields = {'file': (filename, open(upfile, 'rb'), mimetype)}
fields.update({'jsonData': f'{dv4meta}'})
multi = MultipartEncoder(fields=fields)
ctype = {'Content-type' : multi.content_type}
tmphead = self.auth.copy()
tmphead.update(ctype)
tmphead.update(USER_AGENT)
url = dest + '/api/datasets/:persistentId/add'
try:
upload = self.session.post(url, params=params,
headers=tmphead,
data=multi, timeout=timeout)
#print(upload.text)
upload.raise_for_status()
self.fileUpRecord.append((fid, upload.json()))
upmd5 = upload.json()['data']['files'][0]['dataFile']['checksum']['value']
#Dataverse hash type
_type = upload.json()['data']['files'][0]['dataFile']['checksum']['type']
if _type.lower() != hashtype.lower():
comparator = self._check_md5(upfile, _type.lower())
else:
comparator = digest
#if hashtype.lower () != 'md5':
# #get an md5 because dataverse uses md5s. Or most of them do anyway.
# #One day this will be rewritten properly.
# md5 = self._check_md5(filename, 'md5')
#else:
# md5 = digest
#if md5 and (upmd5 != md5):
if upmd5 != comparator:
try:
raise exceptions.HashError(f'{_type} mismatch:\nlocal: {comparator}\nuploaded: {upmd5}')
except exceptions.HashError as e:
LOGGER.exception(e)
raise
#Make damn sure that the study isn't locked because of
#tab file processing
##SPSS files still process despite spoofing MIME and extension
##so there's also a forcible unlock check
#fid = upload.json()['data']['files'][0]['dataFile']['id']
#fid not required for unlock
#self.force_notab_unlock(studyId, dest, fid)
if force_unlock:
self.force_notab_unlock(studyId, dest)
else:
count = 0
wait = True
while wait:
wait = self.file_lock_check(studyId, dest, count=count)
if wait:
time.sleep(15) # Don't hit it too often
count += 1
return (fid, upload.json())
except Exception as e:
LOGGER.exception(e)
try:
reason = upload.json()['message']
LOGGER.warning(upload.json())
return (fid, {'status' : f'Failure: {reason}'})
except Exception as e:
LOGGER.warning('Further exceptions!')
LOGGER.exception(e)
LOGGER.warning(upload.text)
return (fid, {'status' : f'Failure: Reason {upload.reason}'})
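An upload sketch; the file is assumed to already exist in constants.TMP, the study identifier defaults to Transfer.dvpid, and the digest value is a placeholder standing in for the file's real md5:
fid, resp = transfer.upload_file(
    dryadUrl='https://datadryad.org/api/v2/files/385820/download',
    filename='Readme_ACG_Mortality.txt',
    mimetype='text/plain',
    size=1350,
    descr='',
    hashtype='md5',
    digest='0123456789abcdef0123456789abcdef')  # placeholder md5 of the local file
if resp.get('status', '').startswith('Failure'):
    ...   # consult the log; the upload did not succeed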
upload_files(files=None, pid=None, fprefix=None, force_unlock=False)
¶
Uploads multiple files to study with persistentId pid. Returns a list of the original tuples plus JSON responses.
Source code in src/dryad2dataverse/transfer.py
def upload_files(self, files=None, pid=None, fprefix=None, force_unlock=False):
'''
Uploads multiple files to study with persistentId pid.
Returns a list of the original tuples plus JSON responses.
Parameters
----------
files : list
List contains tuples with
(dryadDownloadURL, filename, mimetype, size).
pid : str
Defaults to self.dvpid, which is generated by calling
dryad2dataverse.transfer.Transfer.upload_study().
fprefix : str
File location prefix.
Defaults to dryad2dataverse.constants.TMP
force_unlock : bool
Attempt forcible unlock instead of waiting for tabular
file processing.
Defaults to False.
The Dataverse `/locks` endpoint blocks POST and DELETE requests
from non-superusers (undocumented as of 31 March 2021).
**Forcible unlock requires a superuser API key.**
'''
if not files:
files = self.files
if not fprefix:
fprefix = constants.TMP
out = []
for f in files:
#out.append(self.upload_file(f[0], f[1], f[2], f[3],
# f[4], f[5], pid, fprefix=fprefix))
#out.append(self.upload_file(*[x for x in f],
#last item in files is not necessary
out.append(self.upload_file(*list(f)[:-1],
studyId=pid, fprefix=fprefix,
force_unlock=force_unlock))
return out
upload_json(studyId=None, dest=None)
¶
Uploads Dryad json as a separate file for archival purposes.
Source code in src/dryad2dataverse/transfer.py
def upload_json(self, studyId=None, dest=None):
'''
Uploads Dryad json as a separate file for archival purposes.
Parameters
----------
studyId : str
Dataverse persistent identifier.
Default dryad2dataverse.transfer.Transfer.dvpid,
which is only generated on
dryad2dataverse.transfer.Transfer.upload_study()
dest : str
Base URL for transfer.
Default dryad2dataverse.constants.DVURL
'''
if not studyId:
studyId = self.dvpid
if not dest:
dest = constants.DVURL
if not self.jsonFlag:
url = dest + '/api/datasets/:persistentId/add'
pack = io.StringIO(json.dumps(self.dryad.dryadJson))
desc = {'description':'Original JSON from Dryad',
'categories':['Documentation', 'Code']}
fname = self.doi[self.doi.rfind('/')+1:].replace('.', '_')
payload = {'file': (f'{fname}.json', pack, 'text/plain;charset=UTF-8'),
'jsonData':f'{desc}'}
params = {'persistentId':studyId}
try:
meta = self.session.post(f'{url}',
params=params,
headers=self.auth,
files=payload)
#0 because no dryad fid will be zero
meta.raise_for_status()
self.fileUpRecord.append((0, meta.json()))
self.jsonFlag = (0, meta.json())
LOGGER.debug('Successfully uploaded Dryad JSON to %s', studyId)
#JSON uploads randomly fail with a Dataverse server.log error of
#"A system exception occurred during an invocation on EJB . . ."
#Not reproducible, so errors will only be written to the log.
#Jesus.
except (requests.exceptions.HTTPError,
requests.exceptions.ConnectionError) as err:
LOGGER.error('Unable to upload Dryad JSON to %s', studyId)
LOGGER.error('ERROR message: %s', meta.text)
LOGGER.exception(err)
#And further checking as to what is happening
self.fileUpRecord.append((0, {'status':'Failure: Unable to upload Dryad JSON'}))
if not isinstance(self.dryad.dryadJson, dict):
LOGGER.error('Dryad JSON is not a dictionary')
except Exception as err:
LOGGER.error('Unable to upload Dryad JSON')
LOGGER.exception(err)
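A minimal sketch; the target study defaults to Transfer.dvpid, so this is normally called after upload_study() has created the study:
transfer.upload_json()   # attaches the original Dryad JSON as a documentation file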
upload_study(url=None, apikey=None, timeout=45, **kwargs)
¶
Uploads Dryad study metadata to target Dataverse or updates existing.
Supplying a targetDv kwarg creates a new study and supplying a
dvpid kwarg updates a currently existing Dataverse study.
Notes
One of targetDv or dvpid is required.
Source code in src/dryad2dataverse/transfer.py
def upload_study(self, url=None, apikey=None, timeout=45, **kwargs):
'''
Uploads Dryad study metadata to target Dataverse or updates existing.
Supplying a `targetDv` kwarg creates a new study and supplying a
`dvpid` kwarg updates a currently existing Dataverse study.
Parameters
----------
url : str
URL of Dataverse instance. Defaults to constants.DVURL.
apikey : str
API key of user. Defaults to constants.APIKEY.
timeout : int
timeout on POST request.
kwargs : dict
Other parameters
----------------
targetDv : str
Short name of target dataverse. Required if new dataset.
Specify as targetDV=value.
dvpid : str
Dataverse persistent ID (for updating metadata).
This is not required for new uploads, specify as dvpid=value
Notes
-----
One of targetDv or dvpid is required.
'''
if not url:
url = constants.DVURL
if not apikey:
apikey = constants.APIKEY
headers = {'X-Dataverse-key' : apikey}
headers.update(USER_AGENT)
targetDv = kwargs.get('targetDv')
dvpid = kwargs.get('dvpid')
#dryFid = kwargs.get('dryFid') #Why did I put this here?
if not targetDv and not dvpid:
try:
raise exceptions.NoTargetError('You must supply one of targetDv \
(target dataverse) \
or dvpid (Dataverse persistent ID)')
except exceptions.NoTargetError as e:
LOGGER.error('No target dataverse or dvpid supplied')
LOGGER.exception(e)
raise
if targetDv and dvpid:
try:
raise ValueError('Supply only one of targetDv or dvpid')
except ValueError as e:
LOGGER.exception(e)
raise
if not dvpid:
endpoint = f'{url}/api/dataverses/{targetDv}/datasets'
upload = self.session.post(endpoint,
headers=headers,
json=self.dryad.dvJson,
timeout=timeout)
LOGGER.debug(upload.text)
else:
endpoint = f'{url}/api/datasets/:persistentId/versions/:draft'
params = {'persistentId':dvpid}
#Yes, dataverse uses *different* json for edits
upload = self.session.put(endpoint, params=params,
headers=headers,
json=self.dryad.dvJson['datasetVersion'],
timeout=timeout)
#self._dvrecord = upload.json()
LOGGER.debug(upload.text)
try:
updata = upload.json()
self.dvStudy = updata
if updata.get('status') != 'OK':
try:
raise exceptions.DataverseUploadError(('Status return is not OK.'
f'{upload.status_code}: '
f'{upload.reason}. '
f'{upload.request.url} '
f'{upload.text}'))
except exceptions.DataverseUploadError as e:
LOGGER.exception(e)
LOGGER.exception(traceback.format_exc())
raise exceptions.DataverseUploadError(('Status return is not OK.'
f'{upload.status_code}: '
f'{upload.reason}. '
f'{upload.request.url} '
f'{upload.text}'))
upload.raise_for_status()
except Exception as e: # Only accessible via non-requests exception
LOGGER.exception(e)
LOGGER.exception(traceback.format_exc())
raise
if targetDv:
self.dryad.dvpid = updata['data'].get('persistentId')
if dvpid:
self.dryad.dvpid = updata['data'].get('datasetPersistentId')
return self.dvpid
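A usage sketch; 'mycollection' is a placeholder collection alias and transfer is an assumed Transfer instance as in the earlier examples:
# Create a new study in an existing Dataverse collection
pid = transfer.upload_study(targetDv='mycollection')
# Later, push metadata changes to the same study
transfer.upload_study(dvpid=pid)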
dryad2dataverse.monitor
¶
Dryad/Dataverse status tracker. Monitor creates a singleton object which writes to a SQLite database. Methods generally take either a dryad2dataverse.serializer.Serializer instance or a dryad2dataverse.transfer.Transfer instance.
The monitor's primary function is to allow state checking of Dryad studies so that files and studies aren't downloaded unnecessarily.
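A condensed pipeline sketch follows; the DOI, database path and Transfer constructor call are illustrative assumptions rather than a verbatim recipe:
from dryad2dataverse import monitor, serializer, transfer

mon = monitor.Monitor('/tmp/tracking.db')                  # singleton; path is a placeholder
ser = serializer.Serializer('doi:10.5061/dryad.example')   # placeholder DOI
state = mon.status(ser)
if state['status'] != 'identical':
    trans = transfer.Transfer(ser)   # assumed constructor; see the Transfer class above
    # ...download files and upload study/files as required...
    mon.update(trans)                # record the new state as the last step
mon.set_timestamp()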
Monitor
¶
The Monitor object is a tracker and database updater, so that Dryad files can be monitored and updated over time. Monitor is a singleton, but is not thread-safe.
Source code in src/dryad2dataverse/monitor.py
class Monitor():
'''
The Monitor object is a tracker and database updater, so that
Dryad files can be monitored and updated over time. Monitor is a singleton,
but is not thread-safe.
'''
__instance = None
def __new__(cls, dbase=None, *args, **kwargs):
'''
Creates a new singleton instance of Monitor.
Also creates a database if existing database is not present.
Parameters
----------
dbase : str
Path to sqlite3 database. That is:
/path/to/file.sqlite3
*args : list
**kwargs : dict
'''
if cls.__instance is None:
cls.__instance = super(Monitor, cls).__new__(cls)
cls.__instance.__initialized = False
cls.dbase = dbase
if not cls.dbase:
cls.dbase = constants.DBASE
cls.conn = sqlite3.Connection(cls.dbase)
cls.cursor = cls.conn.cursor()
create = ['CREATE TABLE IF NOT EXISTS dryadStudy \
(uid INTEGER PRIMARY KEY AUTOINCREMENT, \
doi TEXT, lastmoddate TEXT, dryadjson TEXT, \
dvjson TEXT);',
'CREATE TABLE IF NOT EXISTS dryadFiles \
(dryaduid INTEGER REFERENCES dryadStudy (uid), \
dryfilesjson TEXT);',
'CREATE TABLE IF NOT EXISTS dvStudy \
(dryaduid INTEGER references dryadStudy (uid), \
dvpid TEXT);',
'CREATE TABLE IF NOT EXISTS dvFiles \
(dryaduid INTEGER references dryadStudy (uid), \
dryfid INT, \
drymd5 TEXT, dvfid TEXT, dvmd5 TEXT, \
dvfilejson TEXT);',
'CREATE TABLE IF NOT EXISTS lastcheck \
(checkdate TEXT);',
'CREATE TABLE IF NOT EXISTS failed_uploads \
(dryaduid INTEGER references dryadstudy (uid), \
dryfid INT, status TEXT);'
]
for line in create:
cls.cursor.execute(line)
cls.conn.commit()
LOGGER.info('Using database %s', cls.dbase)
return cls.__instance
def __init__(self, dbase=None, *args, **kwargs):
# remove args and kwargs when you find out how init interacts with new.
'''
Initialize the Monitor instance if not instantiated already (ie, Monitor
is a singleton).
Parameters
----------
dbase : str, default=dryad2dataverse.constants.DBASE
Complete path to desired location of tracking database
(eg: /tmp/test.db).
*args : list
**kwargs : dict
'''
if self.__initialized:
return
self.__initialized = True
if not dbase:
self.dbase = constants.DBASE
else:
self.dbase = dbase
def __del__(self):
'''
Commits all database transactions on object deletion and closes database.
'''
self.conn.commit()
self.conn.close()
@property
def lastmod(self):
'''
Returns last modification date from monitor.dbase.
'''
self.cursor.execute('SELECT checkdate FROM lastcheck ORDER BY rowid DESC;')
last_mod = self.cursor.fetchall()
if last_mod:
return last_mod[0][0]
return None
def status(self, serial)->dict:
'''
Returns a dictionary with keys 'status' and 'dvpid' and 'notes'.
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
Returns
-------
`{status :'updated', 'dvpid':'doi://some/ident'}`.
Notes
------
`status` is one of 'new', 'identical', 'lastmodsame',
'updated'
'new' is a completely new file.
'identical' The metadata from Dryad is *identical* to the last time
the check was run.
'lastmodsame' Dryad lastModificationDate == last modification date
in database AND output JSON is different.
This can indicate a Dryad
API output change, reindexing or something else.
But the lastModificationDate
is supposed to be an indicator of meaningful change, so this option
exists so you can decide what to do given this option
'updated' Indicates changes to lastModificationDate
Note that Dryad constantly changes their API output, so the changes
may not actually be meaningful.
`dvpid` is a Dataverse persistent identifier.
`None` in the case of status='new'
`notes`: value of Dryad versionChanges field. One of `files_changed` or
`metadata_changed`. Non-null value present only when status is
not `new` or `identical`. Note that Dryad has no way to indicate *both*
a file and metadata change, so this value reflects only the *last* change
in the Dryad state.
'''
# Last mod date is indicator of change.
# From email w/Ryan Scherle 10 Nov 2020
#The versionNumber updates for either a metadata change or a
#file change. Although we save all of these changes internally, our web
#interface only displays the versions that have file changes, along
#with the most recent metadata. So a dataset that has only two versions
#of files listed on the web may actually have several more versions in
#the API.
#
#If your only need is to track when there are changes to a
#dataset, you may want to use the `lastModificationDate`, which we have
#recently added to our metadata.
#
#Note that the Dryad API output ISN'T STABLE; they add fields etc.
#This means that a comparison of JSON may yield differences even though
#metadata is technically "the same". Just comparing two dicts doesn't cut
#it.
#############################
## Note: by inspection, Dryad outputs JSON that is different
## EVEN IF lastModificationDate is unchanged. (14 January 2022)
## So now what?
#############################
doi = serial.dryadJson['identifier']
self.cursor.execute('SELECT * FROM dryadStudy WHERE doi = ?',
(doi,))
result = self.cursor.fetchall()
if not result:
return {'status': 'new', 'dvpid': None, 'notes': ''}
# dvjson = json.loads(result[-1][4])
# Check the fresh vs. updated jsons for the keys
try:
dryaduid = result[-1][0]
self.cursor.execute('SELECT dvpid from dvStudy WHERE \
dryaduid = ?', (dryaduid,))
dvpid = self.cursor.fetchall()[-1][0]
serial.dvpid = dvpid
except TypeError:
try:
raise exceptions.DatabaseError
except exceptions.DatabaseError as e:
LOGGER.error('Dryad DOI : %s. Error finding Dataverse PID', doi)
LOGGER.exception(e)
raise
newfile = copy.deepcopy(serial.dryadJson)
testfile = copy.deepcopy(json.loads(result[-1][3]))
if newfile == testfile:
return {'status': 'identical', 'dvpid': dvpid, 'notes': ''}
if newfile['lastModificationDate'] != testfile['lastModificationDate']:
return {'status': 'updated', 'dvpid': dvpid,
'notes': newfile['versionChanges']}
return {'status': 'lastmodsame', 'dvpid': dvpid,
'notes': newfile.get('versionChanges')}
def diff_metadata(self, serial):
'''
Analyzes differences in metadata between current serializer
instance and last updated serializer instance.
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
Returns
-------
Returns a list of field changes consisting of:
[{key: (old_value, new_value}] or None if no changes.
Notes
-----
For example:
```
[{'title':
('Cascading effects of algal warming in a freshwater community',
'Cascading effects of algal warming in a freshwater community theatre')}
]
```
'''
if self.status(serial)['status'] == 'updated':
self.cursor.execute('SELECT dryadjson from dryadStudy \
WHERE doi = ?',
(serial.dryadJson['identifier'],))
oldJson = json.loads(self.cursor.fetchall()[-1][0])
out = []
for k in serial.dryadJson:
if serial.dryadJson[k] != oldJson.get(k):
out.append({k: (oldJson.get(k), serial.dryadJson[k])})
return out
return None
@staticmethod
def __added_hashes(oldFiles, newFiles):
'''
Checks that two objects in dryad2dataverse.serializer.files format
stripped of digestType and digest values are identical. Returns array
of files with changed hash.
Assumes name, mimeType, size, descr all unchanged, which is not
necessarily a valid assumption
Parameters
----------
oldFiles : Union[list, tuple]
(name, mimeType, size, descr, digestType, digest)
newFiles : Union[list, tuple]
(name, mimeType, size, descr, digestType, digest)
'''
hash_change = []
old = [x[1:-2] for x in oldFiles]
#URLs are not permanent
old_no_url = [x[1:] for x in oldFiles]
for fi in newFiles:
if fi[1:-2] in old and fi[1:] not in old_no_url:
hash_change.append(fi)
return hash_change
def diff_files(self, serial):
'''
Returns a dict with additions and deletions from previous Dryad
to dataverse upload.
Because checksums are not necessarily included in Dryad file
metadata, this method uses dryad file IDs, size, or
whatever is available.
If dryad2dataverse.monitor.Monitor.status()
indicates a change it will produce dictionary output with a list
of additions, deletions or hash changes (ie, identical
except for hash changes), as below:
`{'add': [dryadfiletuples], 'delete': [dryadfiletuples],
'hash_change': [dryadfiletuples]}`
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
'''
diffReport = {}
if self.status(serial)['status'] == 'new':
#do we want to show what needs to be added?
return {'add': serial.files}
#return {}
self.cursor.execute('SELECT uid from dryadStudy WHERE doi = ?',
(serial.doi,))
mostRecent = self.cursor.fetchall()[-1][0]
self.cursor.execute('SELECT dryfilesjson from dryadFiles WHERE \
dryaduid = ?', (mostRecent,))
oldFileList = self.cursor.fetchall()[-1][0]
if not oldFileList:
oldFileList = []
else:
out = []
#With Dryad API change, files are paginated
#now stored as list
for old in json.loads(oldFileList):
#for old in oldFileList:
oldFiles = old['_embedded'].get('stash:files')
# comparing file tuples from dryad2dataverse.serializer.
# Maybe JSON is better?
# because of code duplication below.
for f in oldFiles:
#Download links are not persistent. Be warned
try:
downLink = f['_links']['stash:file-download']['href']
except KeyError:
downLink = f['_links']['stash:download']['href']
downLink = f'{constants.DRYURL}{downLink}'
name = f['path']
mimeType = f['mimeType']
size = f['size']
descr = f.get('description', '')
digestType = f.get('digestType', '')
digest = f.get('digest', '')
out.append((downLink, name, mimeType, size, descr, digestType, digest))
oldFiles = out
newFiles = serial.files[:]
# Tests go here
#Check for identity first
#if returned here there are definitely no changes
if (set(oldFiles).issuperset(set(newFiles)) and
set(newFiles).issuperset(oldFiles)):
return diffReport
#filenames for checking hash changes.
#Can't use URL or hashes for comparisons because they can change
#without warning, despite the fact that the API says that
#file IDs are unique. They aren't. Verified by Ryan Scherle at
#Dryad December 2021
old_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(oldFiles)}
new_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(newFiles)}
old_no_hash = [old_map[x]['no_hash'] for x in old_map]
new_no_hash = [new_map[x]['no_hash'] for x in new_map]
#check for added hash only
hash_change = Monitor.__added_hashes(oldFiles, newFiles)
must = set(old_no_hash).issuperset(set(new_no_hash))
if not must:
needsadd = set(new_no_hash) - (set(old_no_hash) & set(new_no_hash))
#Use the map created above to return the full file info
diffReport.update({'add': [new_map[new_no_hash.index(x)]['orig']
for x in needsadd]})
must = set(new_no_hash).issuperset(old_no_hash)
if not must:
needsdel = set(old_no_hash) - (set(new_no_hash) & set(old_no_hash))
diffReport.update({'delete' : [old_map[old_no_hash.index(x)]['orig']
for x in needsdel]})
if hash_change:
diffReport.update({'hash_change': hash_change})
return diffReport
def get_dv_fid(self, url):
'''
Returns str — the Dataverse file ID from parsing a Dryad
file download link. Normally used for determining dataverse
file ids for *deletion* in case of dryad file changes.
Parameters
----------
url : str
*Dryad* file URL in form of
'https://datadryad.org/api/v2/files/385819/download'.
'''
fid = url[url.rfind('/', 0, -10)+1:].strip('/download')
try:
fid = int(fid)
except ValueError as e:
LOGGER.error('File ID %s is not an integer', fid)
LOGGER.exception(e)
raise
#File IDs are *CHANGEABLE* according to Dryad, Dec 2021
#SQLite default returns are by ROWID ASC, so the last record
#returned should still be the correct, ie. most recent, one.
#However, just in case, this is now done explicitly.
self.cursor.execute('SELECT dvfid, ROWID FROM dvFiles WHERE \
dryfid = ? ORDER BY ROWID ASC;', (fid,))
dvfid = self.cursor.fetchall()
if dvfid:
return dvfid[-1][0]
return None
def get_dv_fids(self, filelist):
'''
Returns Dataverse file IDs from a list of Dryad file tuples.
Generally, you would use the output from
dryad2dataverse.monitor.Monitor.diff_files['delete']
to discover Dataverse file ids for deletion.
Parameters
----------
filelist : list
List of Dryad file tuples: eg:
```
[('https://datadryad.org/api/v2/files/385819/download',
'GCB_ACG_Mortality_2020.zip',
'application/x-zip-compressed', 23787587),
('https://datadryad.org/api/v2/files/385820/download',
'Readme_ACG_Mortality.txt',
'text/plain', 1350)]
```
'''
fids = []
for f in filelist:
fids.append(self.get_dv_fid(f[0]))
return fids
# return [self.get_dv_fid(f[0]) for f in filelist]
def get_json_dvfids(self, serial)->list:
'''
Return a list of Dataverse file ids for Dryad JSONs which were
uploaded to Dataverse.
Normally used to discover the file IDs to remove Dryad JSONs
which have changed.
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
Returns
-------
list
'''
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi=?',
(serial.doi,))
try:
uid = self.cursor.fetchone()[0]
self.cursor.execute('SELECT dvfid FROM dvFiles WHERE \
dryaduid = ? AND dryfid=?', (uid, 0))
jsonfid = [f[0] for f in self.cursor.fetchall()]
return jsonfid
except TypeError:
return []
def update(self, transfer):
'''
Updates the Monitor database with information from a
dryad2dataverse.transfer.Transfer instance.
If a Dryad primary metadata record has changes, it will be
deleted from the database.
This method should be called after all transfers are completed,
including Dryad JSON updates, as the last action for transfer.
Parameters
----------
transfer : dryad2dataverse.transfer.Transfer
'''
# get the pre-update dryad uid in case we need it.
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi = ?',
(transfer.dryad.dryadJson['identifier'],))
olduid = self.cursor.fetchone()[0]
if olduid:
olduid = int(olduid)
if self.status(transfer.dryad)['status'] != 'unchanged':
doi = transfer.doi
lastmod = transfer.dryad.dryadJson.get('lastModificationDate')
dryadJson = json.dumps(transfer.dryad.dryadJson)
dvJson = json.dumps(transfer.dvStudy)
# Update study metadata
self.cursor.execute('INSERT INTO dryadStudy \
(doi, lastmoddate, dryadjson, dvjson) \
VALUES (?, ?, ?, ?)',
(doi, lastmod, dryadJson, dvJson))
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE \
doi = ?', (doi,))
dryaduid = self.cursor.fetchone()[0]
#if type(dryaduid) != int:
if not isinstance(dryaduid, int):
try:
raise TypeError('Dryad UID is not an integer')
except TypeError as e:
LOGGER.error(e)
raise
# Update dryad file json
self.cursor.execute('INSERT INTO dryadFiles VALUES (?, ?)',
(dryaduid,
json.dumps(transfer.dryad.fileJson)))
# Update dataverse study map
self.cursor.execute('SELECT dvpid FROM dvStudy WHERE \
dvpid = ?', (transfer.dryad.dvpid,))
if not self.cursor.fetchone():
self.cursor.execute('INSERT INTO dvStudy VALUES (?, ?)',
(dryaduid, transfer.dryad.dvpid))
else:
self.cursor.execute('UPDATE dvStudy SET dryaduid=?, \
dvpid=? WHERE dvpid =?',
(dryaduid, transfer.dryad.dvpid,
transfer.dryad.dvpid))
# Update the files table
# Because we want to have a *complete* file list for each
# dryaduid, we have to copy any existing old files,
# then add and delete.
if olduid:
self.cursor.execute('SELECT * FROM dvFiles WHERE \
dryaduid=?', (olduid,))
inserter = self.cursor.fetchall()
for rec in inserter:
# TODONE FIX THIS #I think it's fixed 11 Feb 21
self.cursor.execute('INSERT INTO dvFiles VALUES \
(?, ?, ?, ?, ?, ?)',
(dryaduid, rec[1], rec[2],
rec[3], rec[4], rec[5]))
# insert newly uploaded files
for rec in transfer.fileUpRecord:
try:
dvfid = rec[1]['data']['files'][0]['dataFile']['id']
# Screw you for burying the file ID this deep
recMd5 = rec[1]['data']['files'][0]['dataFile']['checksum']['value']
except (KeyError, IndexError) as err:
#write to failed uploads table instead
status = rec[1].get('status')
if not status:
LOGGER.error('JSON read error for Dryad file ID %s', rec[0])
LOGGER.error('File %s for DOI %s may not be uploaded', rec[0], transfer.doi)
LOGGER.exception(err)
msg = {'status': 'Failure: Other non-specific '
'failure. Check logs'}
self.cursor.execute('INSERT INTO failed_uploads VALUES \
(?, ?, ?);', (dryaduid, rec[0], json.dumps(msg)))
continue
self.cursor.execute('INSERT INTO failed_uploads VALUES \
(?, ?, ?);', (dryaduid, rec[0], json.dumps(rec[1])))
LOGGER.warning(type(err))
LOGGER.warning('%s. DOI %s, File ID %s',
rec[1].get('status'),
transfer.doi, rec[0])
continue
# md5s verified during upload step, so they should
# match already
self.cursor.execute('INSERT INTO dvFiles VALUES \
(?, ?, ?, ?, ?, ?)',
(dryaduid, rec[0], recMd5,
dvfid, recMd5, json.dumps(rec[1])))
# Now the deleted files
for rec in transfer.fileDelRecord:
# fileDelRecord consists only of [fid,fid2, ...]
# Dryad record ID is int not str
self.cursor.execute('DELETE FROM dvFiles WHERE dvfid=? \
AND dryaduid=?',
(int(rec), dryaduid))
LOGGER.debug('deleted dryfid = %s, dryaduid = %s', rec, dryaduid)
# And lastly, any JSON metadata updates:
# NOW WHAT?
# JSON has dryfid==0
self.cursor.execute('SELECT * FROM dvfiles WHERE \
dryfid=? and dryaduid=?',
(0, dryaduid))
try:
exists = self.cursor.fetchone()[0]
# Old metadata must be deleted on a change.
if exists:
shouldDel = self.status(transfer.dryad)['status']
if shouldDel == 'updated':
self.cursor.execute('DELETE FROM dvfiles WHERE \
dryfid=? and dryaduid=?',
(0, dryaduid))
except TypeError:
pass
if transfer.jsonFlag:
# update dryad JSON
djson5 = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['checksum']['value']
dfid = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['id']
self.cursor.execute('INSERT INTO dvfiles VALUES \
(?, ?, ?, ?, ?, ?)',
(dryaduid, 0, djson5, dfid,
djson5, json.dumps(transfer.jsonFlag[1])))
self.conn.commit()
def set_timestamp(self, curdate=None):
'''
Adds current time to the database table. Can be queried and used
for subsequent update checks. To query the last modification time,
use the dryad2dataverse.monitor.Monitor.lastmod attribute.
Parameters
----------
curdate : str
UTC datetime string in the format suitable for the Dryad API.
eg. 2021-01-21T21:42:40Z
or .strftime('%Y-%m-%dT%H:%M:%SZ').
'''
#Dryad API uses Zulu time
if not curdate:
curdate = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
self.cursor.execute('INSERT INTO lastcheck VALUES (?)',
(curdate,))
self.conn.commit()
lastmod
property
¶
Returns last modification date from monitor.dbase.
__del__()
¶
Commits all database transactions on object deletion and closes database.
Source code in src/dryad2dataverse/monitor.py
def __del__(self):
'''
Commits all database transactions on object deletion and closes database.
'''
self.conn.commit()
self.conn.close()
__init__(dbase=None, *args, **kwargs)
¶
Initialize the Monitor instance if not instantiated already (ie, Monitor is a singleton).
Source code in src/dryad2dataverse/monitor.py
def __init__(self, dbase=None, *args, **kwargs):
# remove args and kwargs when you find out how init interacts with new.
'''
Initialize the Monitor instance if not instantiated already (ie, Monitor
is a singleton).
Parameters
----------
dbase : str, default=dryad2dataverse.constants.DBASE
Complete path to desired location of tracking database
(eg: /tmp/test.db).
*args : list
**kwargs : dict
'''
if self.__initialized:
return
self.__initialized = True
if not dbase:
self.dbase = constants.DBASE
else:
self.dbase = dbase
__new__(dbase=None, *args, **kwargs)
¶
Creates a new singleton instance of Monitor.
Also creates a database if existing database is not present.
Source code in src/dryad2dataverse/monitor.py
def __new__(cls, dbase=None, *args, **kwargs):
'''
Creates a new singleton instance of Monitor.
Also creates a database if existing database is not present.
Parameters
----------
dbase : str
Path to sqlite3 database. That is:
/path/to/file.sqlite3
*args : list
**kwargs : dict
'''
if cls.__instance is None:
cls.__instance = super(Monitor, cls).__new__(cls)
cls.__instance.__initialized = False
cls.dbase = dbase
if not cls.dbase:
cls.dbase = constants.DBASE
cls.conn = sqlite3.Connection(cls.dbase)
cls.cursor = cls.conn.cursor()
create = ['CREATE TABLE IF NOT EXISTS dryadStudy \
(uid INTEGER PRIMARY KEY AUTOINCREMENT, \
doi TEXT, lastmoddate TEXT, dryadjson TEXT, \
dvjson TEXT);',
'CREATE TABLE IF NOT EXISTS dryadFiles \
(dryaduid INTEGER REFERENCES dryadStudy (uid), \
dryfilesjson TEXT);',
'CREATE TABLE IF NOT EXISTS dvStudy \
(dryaduid INTEGER references dryadStudy (uid), \
dvpid TEXT);',
'CREATE TABLE IF NOT EXISTS dvFiles \
(dryaduid INTEGER references dryadStudy (uid), \
dryfid INT, \
drymd5 TEXT, dvfid TEXT, dvmd5 TEXT, \
dvfilejson TEXT);',
'CREATE TABLE IF NOT EXISTS lastcheck \
(checkdate TEXT);',
'CREATE TABLE IF NOT EXISTS failed_uploads \
(dryaduid INTEGER references dryadstudy (uid), \
dryfid INT, status TEXT);'
]
for line in create:
cls.cursor.execute(line)
cls.conn.commit()
LOGGER.info('Using database %s', cls.dbase)
return cls.__instance
diff_files(serial)
¶
Returns a dict with additions and deletions from previous Dryad to dataverse upload.
Because checksums are not necessarily included in Dryad file metadata, this method uses dryad file IDs, size, or whatever is available.
If dryad2dataverse.monitor.Monitor.status() indicates a change it will produce dictionary output with a list of additions, deletions or hash changes (ie, identical except for hash changes), as below:
{'add': [dryadfiletuples], 'delete': [dryadfiletuples],
'hash_change': [dryadfiletuples]}
Source code in src/dryad2dataverse/monitor.py
def diff_files(self, serial):
'''
Returns a dict with additions and deletions from previous Dryad
to dataverse upload.
Because checksums are not necessarily included in Dryad file
metadata, this method uses dryad file IDs, size, or
whatever is available.
If dryad2dataverse.monitor.Monitor.status()
indicates a change it will produce dictionary output with a list
of additions, deletions or hash changes (ie, identical
except for hash changes), as below:
`{'add': [dryadfiletuples], 'delete': [dryadfiletuples],
'hash_change': [dryadfiletuples]}`
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
'''
diffReport = {}
if self.status(serial)['status'] == 'new':
#do we want to show what needs to be added?
return {'add': serial.files}
#return {}
self.cursor.execute('SELECT uid from dryadStudy WHERE doi = ?',
(serial.doi,))
mostRecent = self.cursor.fetchall()[-1][0]
self.cursor.execute('SELECT dryfilesjson from dryadFiles WHERE \
dryaduid = ?', (mostRecent,))
oldFileList = self.cursor.fetchall()[-1][0]
if not oldFileList:
oldFileList = []
else:
out = []
#With Dryad API change, files are paginated
#now stored as list
for old in json.loads(oldFileList):
#for old in oldFileList:
oldFiles = old['_embedded'].get('stash:files')
# comparing file tuples from dryad2dataverse.serializer.
# Maybe JSON is better?
# because of code duplication below.
for f in oldFiles:
#Download links are not persistent. Be warned
try:
downLink = f['_links']['stash:file-download']['href']
except KeyError:
downLink = f['_links']['stash:download']['href']
downLink = f'{constants.DRYURL}{downLink}'
name = f['path']
mimeType = f['mimeType']
size = f['size']
descr = f.get('description', '')
digestType = f.get('digestType', '')
digest = f.get('digest', '')
out.append((downLink, name, mimeType, size, descr, digestType, digest))
oldFiles = out
newFiles = serial.files[:]
# Tests go here
#Check for identity first
#if returned here there are definitely no changes
if (set(oldFiles).issuperset(set(newFiles)) and
set(newFiles).issuperset(oldFiles)):
return diffReport
#filenames for checking hash changes.
#Can't use URL or hashes for comparisons because they can change
#without warning, despite the fact that the API says that
#file IDs are unique. They aren't. Verified by Ryan Scherle at
#Dryad December 2021
old_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(oldFiles)}
new_map = {x:{'orig':y, 'no_hash':y[1:4]} for x,y in enumerate(newFiles)}
old_no_hash = [old_map[x]['no_hash'] for x in old_map]
new_no_hash = [new_map[x]['no_hash'] for x in new_map]
#check for added hash only
hash_change = Monitor.__added_hashes(oldFiles, newFiles)
must = set(old_no_hash).issuperset(set(new_no_hash))
if not must:
needsadd = set(new_no_hash) - (set(old_no_hash) & set(new_no_hash))
#Use the map created above to return the full file info
diffReport.update({'add': [new_map[new_no_hash.index(x)]['orig']
for x in needsadd]})
must = set(new_no_hash).issuperset(old_no_hash)
if not must:
needsdel = set(old_no_hash) - (set(new_no_hash) & set(old_no_hash))
diffReport.update({'delete' : [old_map[old_no_hash.index(x)]['orig']
for x in needsdel]})
if hash_change:
diffReport.update({'hash_change': hash_change})
return diffReport
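A sketch of acting on a diff report, reusing mon and ser from the sketch at the top of this module's documentation:
report = mon.diff_files(ser)
for ftuple in report.get('add', []):
    ...   # download from Dryad and upload to Dataverse via a Transfer instance
if report.get('delete'):
    dv_fids = mon.get_dv_fids(report['delete'])   # Dataverse file ids to delete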
diff_metadata(serial)
¶
Analyzes differences in metadata between current serializer instance and last updated serializer instance.
Notes
For example:
[{'title':
('Cascading effects of algal warming in a freshwater community',
'Cascading effects of algal warming in a freshwater community theatre')}
]
Source code in src/dryad2dataverse/monitor.py
def diff_metadata(self, serial):
'''
Analyzes differences in metadata between current serializer
instance and last updated serializer instance.
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
Returns
-------
Returns a list of field changes consisting of:
[{key: (old_value, new_value}] or None if no changes.
Notes
-----
For example:
```
[{'title':
('Cascading effects of algal warming in a freshwater community',
'Cascading effects of algal warming in a freshwater community theatre')}
]
```
'''
if self.status(serial)['status'] == 'updated':
self.cursor.execute('SELECT dryadjson from dryadStudy \
WHERE doi = ?',
(serial.dryadJson['identifier'],))
oldJson = json.loads(self.cursor.fetchall()[-1][0])
out = []
for k in serial.dryadJson:
if serial.dryadJson[k] != oldJson.get(k):
out.append({k: (oldJson.get(k), serial.dryadJson[k])})
return out
return None
get_dv_fid(url)
¶
Returns the Dataverse file ID (str) parsed from a Dryad file download link. Normally used for determining Dataverse file IDs for deletion in case of Dryad file changes.
Source code in src/dryad2dataverse/monitor.py
def get_dv_fid(self, url):
'''
Returns str — the Dataverse file ID from parsing a Dryad
file download link. Normally used for determining dataverse
file ids for *deletion* in case of dryad file changes.
Parameters
----------
url : str
*Dryad* file URL in form of
'https://datadryad.org/api/v2/files/385819/download'.
'''
fid = url[url.rfind('/', 0, -10)+1:].strip('/download')
try:
fid = int(fid)
except ValueError as e:
LOGGER.error('File ID %s is not an integer', fid)
LOGGER.exception(e)
raise
#File IDs are *CHANGEABLE* according to Dryad, Dec 2021
#SQLite default returns are by ROWID ASC, so the last record
#returned should still be the correct, ie. most recent, one.
#However, just in case, this is now done explicitly.
self.cursor.execute('SELECT dvfid, ROWID FROM dvFiles WHERE \
dryfid = ? ORDER BY ROWID ASC;', (fid,))
dvfid = self.cursor.fetchall()
if dvfid:
return dvfid[-1][0]
return None
get_dv_fids(filelist)
¶
Returns Dataverse file IDs from a list of Dryad file tuples. Generally, you would use the output from dryad2dataverse.monitor.Monitor.diff_files[‘delete’] to discover Dataverse file ids for deletion.
Source code in src/dryad2dataverse/monitor.py
def get_dv_fids(self, filelist):
'''
Returns Dataverse file IDs from a list of Dryad file tuples.
Generally, you would use the output from
dryad2dataverse.monitor.Monitor.diff_files['delete']
to discover Dataverse file ids for deletion.
Parameters
----------
filelist : list
List of Dryad file tuples: eg:
```
[('https://datadryad.org/api/v2/files/385819/download',
'GCB_ACG_Mortality_2020.zip',
'application/x-zip-compressed', 23787587),
('https://datadryad.org/api/v2/files/385820/download',
'Readme_ACG_Mortality.txt',
'text/plain', 1350)]
```
'''
fids = []
for f in filelist:
fids.append(self.get_dv_fid(f[0]))
return fids
get_json_dvfids(serial)
¶
Return a list of Dataverse file ids for Dryad JSONs which were uploaded to Dataverse. Normally used to discover the file IDs to remove Dryad JSONs which have changed.
Source code in src/dryad2dataverse/monitor.py
def get_json_dvfids(self, serial)->list:
'''
Return a list of Dataverse file ids for Dryad JSONs which were
uploaded to Dataverse.
Normally used to discover the file IDs to remove Dryad JSONs
which have changed.
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
Returns
-------
list
'''
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi=?',
(serial.doi,))
try:
uid = self.cursor.fetchone()[0]
self.cursor.execute('SELECT dvfid FROM dvFiles WHERE \
dryaduid = ? AND dryfid=?', (uid, 0))
jsonfid = [f[0] for f in self.cursor.fetchall()]
return jsonfid
except TypeError:
return []
set_timestamp(curdate=None)
¶
Adds the current time to the database table, which can be queried and used when checking for subsequent updates. To query the last modification time, use the dryad2dataverse.monitor.Monitor.lastmod attribute.
Source code in src/dryad2dataverse/monitor.py
def set_timestamp(self, curdate=None):
'''
Adds current time to the database table. Can be queried and used
for subsequent update checks. To query the last modification time,
use the dryad2dataverse.monitor.Monitor.lastmod attribute.
Parameters
----------
curdate : str
UTC datetime string in the format suitable for the Dryad API.
eg. 2021-01-21T21:42:40Z
or .strftime('%Y-%m-%dT%H:%M:%SZ').
'''
#Dryad API uses Zulu time
if not curdate:
curdate = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
self.cursor.execute('INSERT INTO lastcheck VALUES (?)',
(curdate,))
self.conn.commit()
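A short sketch, reusing mon from the module sketch above:
mon.set_timestamp()    # record this check using the current UTC time
print(mon.lastmod)     # e.g. '2021-01-21T21:42:40Z'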
status(serial)
¶
Returns a dictionary with keys ‘status’ and ‘dvpid’ and ‘notes’.
Notes
status is one of 'new', 'identical', 'lastmodsame' or 'updated'.
'new': a completely new study.
'identical': the metadata from Dryad is identical to the last time the check was run.
'lastmodsame': the Dryad lastModificationDate equals the last modification date in the database, but the output JSON differs. This can indicate a Dryad API output change, reindexing or something else; because lastModificationDate is supposed to indicate meaningful change, this status exists so that you can decide what to do.
'updated': the lastModificationDate has changed. Note that Dryad constantly changes its API output, so the changes may not actually be meaningful.
dvpid is a Dataverse persistent identifier; None when status is 'new'.
notes: the value of the Dryad versionChanges field, one of files_changed or metadata_changed. A non-null value is present only when status is not 'new' or 'identical'. Dryad has no way to indicate both a file and a metadata change, so this value reflects only the last change to the Dryad record.
Source code in src/dryad2dataverse/monitor.py
def status(self, serial)->dict:
'''
Returns a dictionary with keys 'status' and 'dvpid' and 'notes'.
Parameters
----------
serial : dryad2dataverse.serializer.Serializer
Returns
-------
`{status :'updated', 'dvpid':'doi://some/ident'}`.
Notes
------
`status` is one of 'new', 'identical', 'lastmodsame',
'updated'
'new' is a completely new file.
'identical' The metadata from Dryad is *identical* to the last time
the check was run.
'lastmodsame' Dryad lastModificationDate == last modification date
in database AND output JSON is different.
This can indicate a Dryad
API output change, reindexing or something else.
But the lastModificationDate
is supposed to be an indicator of meaningful change, so this option
exists so you can decide what to do given this option
'updated' Indicates changes to lastModificationDate
Note that Dryad constantly changes their API output, so the changes
may not actually be meaningful.
`dvpid` is a Dataverse persistent identifier.
`None` in the case of status='new'
`notes`: value of Dryad versionChanges field. One of `files_changed` or
`metadata_changed`. Non-null value present only when status is
not `new` or `identical`. Note that Dryad has no way to indicate *both*
a file and metadata change, so this value reflects only the *last* change
in the Dryad state.
'''
# Last mod date is indicator of change.
# From email w/Ryan Scherle 10 Nov 2020
#The versionNumber updates for either a metadata change or a
#file change. Although we save all of these changes internally, our web
#interface only displays the versions that have file changes, along
#with the most recent metadata. So a dataset that has only two versions
#of files listed on the web may actually have several more versions in
#the API.
#
#If your only need is to track when there are changes to a
#dataset, you may want to use the `lastModificationDate`, which we have
#recently added to our metadata.
#
#Note that the Dryad API output ISN'T STABLE; they add fields etc.
#This means that a comparison of JSON may yield differences even though
#metadata is technically "the same". Just comparing two dicts doesn't cut
#it.
#############################
## Note: by inspection, Dryad outputs JSON that is different
## EVEN IF lastModificationDate is unchanged. (14 January 2022)
## So now what?
#############################
doi = serial.dryadJson['identifier']
self.cursor.execute('SELECT * FROM dryadStudy WHERE doi = ?',
(doi,))
result = self.cursor.fetchall()
if not result:
return {'status': 'new', 'dvpid': None, 'notes': ''}
# dvjson = json.loads(result[-1][4])
# Check the fresh vs. updated jsons for the keys
try:
dryaduid = result[-1][0]
self.cursor.execute('SELECT dvpid from dvStudy WHERE \
dryaduid = ?', (dryaduid,))
dvpid = self.cursor.fetchall()[-1][0]
serial.dvpid = dvpid
except TypeError:
try:
raise exceptions.DatabaseError
except exceptions.DatabaseError as e:
LOGGER.error('Dryad DOI : %s. Error finding Dataverse PID', doi)
LOGGER.exception(e)
raise
newfile = copy.deepcopy(serial.dryadJson)
testfile = copy.deepcopy(json.loads(result[-1][3]))
if newfile == testfile:
return {'status': 'identical', 'dvpid': dvpid, 'notes': ''}
if newfile['lastModificationDate'] != testfile['lastModificationDate']:
return {'status': 'updated', 'dvpid': dvpid,
'notes': newfile['versionChanges']}
return {'status': 'lastmodsame', 'dvpid': dvpid,
'notes': newfile.get('versionChanges')}
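A decision sketch, reusing mon and ser from the module sketch above:
state = mon.status(ser)
if state['status'] == 'new':
    ...   # create the study in Dataverse
elif state['status'] == 'updated':
    ...   # push changes; state['notes'] hints whether files or metadata changed
else:
    ...   # 'identical' or 'lastmodsame': usually nothing to do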
update(transfer)
¶
Updates the Monitor database with information from a dryad2dataverse.transfer.Transfer instance.
If a Dryad primary metadata record has changes, it will be deleted from the database.
This method should be called after all transfers are completed, including Dryad JSON updates, as the last action for transfer.
Source code in src/dryad2dataverse/monitor.py
def update(self, transfer):
'''
Updates the Monitor database with information from a
dryad2dataverse.transfer.Transfer instance.
If a Dryad primary metadata record has changes, it will be
deleted from the database.
This method should be called after all transfers are completed,
including Dryad JSON updates, as the last action for transfer.
Parameters
----------
transfer : dryad2dataverse.transfer.Transfer
'''
# get the pre-update dryad uid in case we need it.
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi = ?',
(transfer.dryad.dryadJson['identifier'],))
olduid = self.cursor.fetchone()[0]
if olduid:
olduid = int(olduid)
if self.status(transfer.dryad)['status'] != 'unchanged':
doi = transfer.doi
lastmod = transfer.dryad.dryadJson.get('lastModificationDate')
dryadJson = json.dumps(transfer.dryad.dryadJson)
dvJson = json.dumps(transfer.dvStudy)
# Update study metadata
self.cursor.execute('INSERT INTO dryadStudy \
(doi, lastmoddate, dryadjson, dvjson) \
VALUES (?, ?, ?, ?)',
(doi, lastmod, dryadJson, dvJson))
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE \
doi = ?', (doi,))
dryaduid = self.cursor.fetchone()[0]
#if type(dryaduid) != int:
if not isinstance(dryaduid, int):
try:
raise TypeError('Dryad UID is not an integer')
except TypeError as e:
LOGGER.error(e)
raise
# Update dryad file json
self.cursor.execute('INSERT INTO dryadFiles VALUES (?, ?)',
(dryaduid,
json.dumps(transfer.dryad.fileJson)))
# Update dataverse study map
self.cursor.execute('SELECT dvpid FROM dvStudy WHERE \
dvpid = ?', (transfer.dryad.dvpid,))
if not self.cursor.fetchone():
self.cursor.execute('INSERT INTO dvStudy VALUES (?, ?)',
(dryaduid, transfer.dryad.dvpid))
else:
self.cursor.execute('UPDATE dvStudy SET dryaduid=?, \
dvpid=? WHERE dvpid =?',
(dryaduid, transfer.dryad.dvpid,
transfer.dryad.dvpid))
# Update the files table
# Because we want to have a *complete* file list for each
# dryaduid, we have to copy any existing old files,
# then add and delete.
if olduid:
self.cursor.execute('SELECT * FROM dvFiles WHERE \
dryaduid=?', (olduid,))
inserter = self.cursor.fetchall()
for rec in inserter:
# TODONE FIX THIS #I think it's fixed 11 Feb 21
self.cursor.execute('INSERT INTO dvFiles VALUES \
(?, ?, ?, ?, ?, ?)',
(dryaduid, rec[1], rec[2],
rec[3], rec[4], rec[5]))
# insert newly uploaded files
for rec in transfer.fileUpRecord:
try:
dvfid = rec[1]['data']['files'][0]['dataFile']['id']
# Screw you for burying the file ID this deep
recMd5 = rec[1]['data']['files'][0]['dataFile']['checksum']['value']
except (KeyError, IndexError) as err:
#write to failed uploads table instead
status = rec[1].get('status')
if not status:
LOGGER.error('JSON read error for Dryad file ID %s', rec[0])
LOGGER.error('File %s for DOI %s may not be uploaded', rec[0], transfer.doi)
LOGGER.exception(err)
msg = {'status': 'Failure: Other non-specific '
'failure. Check logs'}
self.cursor.execute('INSERT INTO failed_uploads VALUES \
(?, ?, ?);', (dryaduid, rec[0], json.dumps(msg)))
continue
self.cursor.execute('INSERT INTO failed_uploads VALUES \
(?, ?, ?);', (dryaduid, rec[0], json.dumps(rec[1])))
LOGGER.warning(type(err))
LOGGER.warning('%s. DOI %s, File ID %s',
rec[1].get('status'),
transfer.doi, rec[0])
continue
# md5s verified during upload step, so they should
# match already
self.cursor.execute('INSERT INTO dvFiles VALUES \
(?, ?, ?, ?, ?, ?)',
(dryaduid, rec[0], recMd5,
dvfid, recMd5, json.dumps(rec[1])))
# Now the deleted files
for rec in transfer.fileDelRecord:
# fileDelRecord consists only of [fid,fid2, ...]
# Dryad record ID is int not str
self.cursor.execute('DELETE FROM dvFiles WHERE dvfid=? \
AND dryaduid=?',
(int(rec), dryaduid))
LOGGER.debug('deleted dryfid = %s, dryaduid = %s', rec, dryaduid)
# And lastly, any JSON metadata updates:
# NOW WHAT?
# JSON has dryfid==0
self.cursor.execute('SELECT * FROM dvfiles WHERE \
dryfid=? and dryaduid=?',
(0, dryaduid))
try:
exists = self.cursor.fetchone()[0]
# Old metadata must be deleted on a change.
if exists:
shouldDel = self.status(transfer.dryad)['status']
if shouldDel == 'updated':
self.cursor.execute('DELETE FROM dvfiles WHERE \
dryfid=? and dryaduid=?',
(0, dryaduid))
except TypeError:
pass
if transfer.jsonFlag:
# update dryad JSON
djson5 = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['checksum']['value']
dfid = transfer.jsonFlag[1]['data']['files'][0]['dataFile']['id']
self.cursor.execute('INSERT INTO dvfiles VALUES \
(?, ?, ?, ?, ?, ?)',
(dryaduid, 0, djson5, dfid,
djson5, json.dumps(transfer.jsonFlag[1])))
self.conn.commit()
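A hedged sketch of where update() sits in a pipeline; the Monitor and Transfer constructor arguments shown are assumptions rather than documented signatures:
>>> import dryad2dataverse.serializer
>>> import dryad2dataverse.transfer
>>> import dryad2dataverse.monitor
>>> ser = dryad2dataverse.serializer.Serializer('doi:10.5061/dryad.2rbnzs7jp')
>>> mon = dryad2dataverse.monitor.Monitor('dryad_tracking.db')
>>> trans = dryad2dataverse.transfer.Transfer(ser)
>>> # ... download, upload, and delete files with trans as required ...
>>> mon.update(trans)  # record the results; call this last, after all transfers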
dryad2dataverse.handlers
¶
Custom log handlers for sending log information to recipients.
SSLSMTPHandler
¶
Bases: SMTPHandler
An SSL handler for logging.handlers
Source code in src/dryad2dataverse/handlers.py
class SSLSMTPHandler(SMTPHandler):
'''
An SSL handler for logging.handlers
'''
def emit(self, record:logging.LogRecord):
'''
Emit a record while using an SSL mail server.
Parameters
----------
record : logging.LogRecord
'''
#Praise be to
#https://stackoverflow.com/questions/36937461/
#how-can-i-send-an-email-using-python-loggings-
#smtphandler-and-ssl
try:
port = self.mailport
if not port:
port = smtplib.SMTP_PORT
smtp = smtplib.SMTP_SSL(self.mailhost, port)
msg = self.format(record)
out = EmailMessage()
out['Subject'] = self.getSubject(record)
out['From'] = self.fromaddr
out['To'] = self.toaddrs
out.set_content(msg)
#global rec2
#rec2 = record
if self.username:
smtp.login(self.username, self.password)
#smtp.sendmail(self.fromaddr, self.toaddrs, msg)
#Attempting to send using smtp.sendmail as above
#results in messages with no text, so use
smtp.send_message(out)
smtp.quit()
except (KeyboardInterrupt, SystemExit):
raise
except: # pylint: disable=bare-except
self.handleError(record)
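Because SSLSMTPHandler subclasses logging.handlers.SMTPHandler, it is configured the same way; only message delivery changes to SMTP over SSL. A minimal sketch with placeholder host, addresses, and credentials:
>>> import logging
>>> from dryad2dataverse.handlers import SSLSMTPHandler
>>> handler = SSLSMTPHandler(mailhost=('smtp.example.com', 465),
...                          fromaddr='pipeline@example.com',
...                          toaddrs=['curator@example.com'],
...                          subject='dryad2dataverse error',
...                          credentials=('username', 'password'))
>>> handler.setLevel(logging.ERROR)
>>> logging.getLogger('dryad2dataverse').addHandler(handler)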
emit(record)
¶
Emit a record while using an SSL mail server.
| Parameters: | `record` : `logging.LogRecord` to be formatted and sent by email. |
|---|---|
Source code in src/dryad2dataverse/handlers.py
def emit(self, record:logging.LogRecord):
'''
Emit a record while using an SSL mail server.
Parameters
----------
record : logging.LogRecord
'''
#Praise be to
#https://stackoverflow.com/questions/36937461/
#how-can-i-send-an-email-using-python-loggings-
#smtphandler-and-ssl
try:
port = self.mailport
if not port:
port = smtplib.SMTP_PORT
smtp = smtplib.SMTP_SSL(self.mailhost, port)
msg = self.format(record)
out = EmailMessage()
out['Subject'] = self.getSubject(record)
out['From'] = self.fromaddr
out['To'] = self.toaddrs
out.set_content(msg)
#global rec2
#rec2 = record
if self.username:
smtp.login(self.username, self.password)
#smtp.sendmail(self.fromaddr, self.toaddrs, msg)
#Attempting to send using smtp.sendmail as above
#results in messages with no text, so use
smtp.send_message(out)
smtp.quit()
except (KeyboardInterrupt, SystemExit):
raise
except: # pylint: disable=bare-except
self.handleError(record)
dryad2dataverse.exceptions
¶
Custom exceptions for error handling.
DatabaseError
¶
Bases: Dryad2DataverseError
Tracking database error.
Source code in src/dryad2dataverse/exceptions.py
class DatabaseError(Dryad2DataverseError):
'''
Tracking database error.
'''
DataverseBadApiKeyError
¶
Bases: Dryad2DataverseError
Returned on a not-OK response (i.e. request.request.json()['message'] == 'Bad api key ').
Source code in src/dryad2dataverse/exceptions.py
class DataverseBadApiKeyError(Dryad2DataverseError):
'''
Returned on a not-OK response (i.e. request.request.json()['message'] == 'Bad api key ').
'''
DataverseDownloadError
¶
Bases: Dryad2DataverseError
Returned on a not-OK response (i.e. not requests.status_code == 200).
Source code in src/dryad2dataverse/exceptions.py
class DataverseDownloadError(Dryad2DataverseError):
'''
Returned on a not-OK response (i.e. not requests.status_code == 200).
'''
DataverseUploadError
¶
Bases: Dryad2DataverseError
Returned on a not-OK response (i.e. not requests.status_code == 200).
Source code in src/dryad2dataverse/exceptions.py
class DataverseUploadError(Dryad2DataverseError):
'''
Returned on a not-OK response (i.e. not requests.status_code == 200).
'''
DownloadSizeError
¶
Bases: Dryad2DataverseError
Raised when download sizes don’t match reported Dryad file size.
Source code in src/dryad2dataverse/exceptions.py
class DownloadSizeError(Dryad2DataverseError):
'''
Raised when download sizes don't match reported
Dryad file size.
'''
Dryad2DataverseError
¶
Bases: Exception
Base exception class for Dryad2Dataverse errors.
Source code in src/dryad2dataverse/exceptions.py
class Dryad2DataverseError(Exception):
'''
Base exception class for Dryad2Dataverse errors.
'''
HashError
¶
Bases: Dryad2DataverseError
Raised on hex digest mismatch.
Source code in src/dryad2dataverse/exceptions.py
class HashError(Dryad2DataverseError):
'''
Raised on hex digest mismatch.
'''
NoTargetError
¶
Bases: Dryad2DataverseError
No dataverse target supplied error.
Source code in src/dryad2dataverse/exceptions.py
class NoTargetError(Dryad2DataverseError):
'''
No dataverse target supplied error.
'''
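Because every exception above derives from Dryad2DataverseError, a pipeline can catch the specific cases it knows how to recover from and fall back to the base class for everything else. An illustrative sketch (the transfer step is only indicated by a comment):
>>> import dryad2dataverse.exceptions as exc
>>> try:
...     pass  # e.g. download/upload files with dryad2dataverse.transfer.Transfer
... except exc.DownloadSizeError:
...     pass  # size mismatch: retry the download or flag the file for review
... except exc.Dryad2DataverseError:
...     pass  # any other dryad2dataverse-specific failure; check the logs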
dryad2dataverse.constants
¶
This module contains the information that configures all the parameters required to transfer data from Dryad to Dataverse.
“Constants” may be a bit strong, but the only constant is the presence of change.
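Settings are plain module attributes, so they can be overridden by assignment before other modules use them. Only DRYURL is shown here because it is referenced by Serializer.fetch_record(); other settings follow the same pattern:
>>> import dryad2dataverse.constants
>>> dryad2dataverse.constants.DRYURL = 'https://datadryad.org'  # Dryad instance base URL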