update from Atlas
This commit is contained in:
261
support/isis2json/isis2json.py
Executable file
261
support/isis2json/isis2json.py
Executable file
@@ -0,0 +1,261 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
# isis2json.py: convert ISIS and ISO-2709 files to JSON
|
||||
#
|
||||
# Copyright (C) 2010 BIREME/PAHO/WHO
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Lesser General Public License as published
|
||||
# by the Free Software Foundation, either version 2.1 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Lesser General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU Lesser General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
############################
|
||||
# BEGIN ISIS2JSON
|
||||
# this script works with Python or Jython (versions >=2.5 and <3)
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from uuid import uuid4
|
||||
import os
|
||||
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
if os.name == 'java': # running Jython
|
||||
from com.xhaus.jyson import JysonCodec as json
|
||||
else:
|
||||
import simplejson as json
|
||||
|
||||
# When true, iter_mst_records drops records whose status is not ACTIVE;
# when false, every record is emitted and carries an ISIS_ACTIVE_KEY flag.
SKIP_INACTIVE = True

# Default for -q/--qty; the option's help text describes it as "ALL".
DEFAULT_QTY = 2**31

# Keys under which iter_mst_records stores the MFN and the active flag.
ISIS_MFN_KEY = 'mfn'
ISIS_ACTIVE_KEY = 'active'

# Prefix placed before each subfield key when a field is flattened to a
# single string (ISIS-JSON type 1 for .mst input).
SUBFIELD_DELIMITER = '^'

# Encoding used to decode raw field values read from .iso files.
INPUT_ENCODING = 'cp1252'
|
||||
|
||||
|
||||
def iter_iso_records(iso_file_name, isis_json_type):  # <1>
    """Yield one dict of fields for each record of an .iso file.

    Each yielded dict maps a field tag (a string with leading zeroes
    removed) to the list of occurrences of that tag in the record, in
    directory order.

    Args:
        iso_file_name: name passed to ``iso2709.IsoFile``.
        isis_json_type: shape of each occurrence:
            1 -- the field value decoded with INPUT_ENCODING;
            2 -- the result of ``subfield.expand`` on that string;
            3 -- ``dict()`` of that same result (so ``expand`` presumably
                 returns key/value pairs -- confirm in the subfield module).

    Raises:
        NotImplementedError: when a field is converted with any other type.
    """
    from iso2709 import IsoFile
    from subfield import expand

    iso = IsoFile(iso_file_name)
    # try/finally guarantees the file is closed even when the consumer stops
    # iterating early (write_json breaks at the --qty limit) or a conversion
    # error propagates out of the loop.
    try:
        for record in iso:
            fields = {}
            for field in record.directory:
                field_key = str(int(field.tag))  # remove leading zeroes
                field_occurrences = fields.setdefault(field_key, [])
                # undecodable bytes are replaced rather than raising
                content = field.value.decode(INPUT_ENCODING, 'replace')
                if isis_json_type == 1:
                    field_occurrences.append(content)
                elif isis_json_type == 2:
                    field_occurrences.append(expand(content))
                elif isis_json_type == 3:
                    field_occurrences.append(dict(expand(content)))
                else:
                    raise NotImplementedError(
                        'ISIS-JSON type %s conversion '
                        'not yet implemented for .iso input' % isis_json_type)
            yield fields
    finally:
        iso.close()
|
||||
|
||||
|
||||
def iter_mst_records(master_file_name, isis_json_type):  # <2>
    """Yield one dict of fields for each record of an .mst master file.

    Every yielded dict holds the record MFN under ISIS_MFN_KEY and maps each
    field id (as a string) to the list of its occurrences. When
    SKIP_INACTIVE is true, records whose status is not ACTIVE are skipped;
    otherwise all records are yielded and ISIS_ACTIVE_KEY tells whether the
    record is active.

    Args:
        master_file_name: name passed to ``MasterFactory.getInstance``.
        isis_json_type: shape of each occurrence:
            1 -- one string: the content of the '*' subfield first, then
                 SUBFIELD_DELIMITER + key + content for each other subfield;
            3 -- a dict: the '*' subfield content under '_', and for every
                 other subfield key a list with the contents found.
            (Type 2 is not implemented for .mst input in this function.)

    Raises:
        SystemExit: when the Bruma library cannot be imported.
        NotImplementedError: when a field is converted with any other type.
    """
    try:
        from bruma.master import MasterFactory, Record
    except ImportError:
        # SystemExit with a message writes it to stderr and exits with a
        # non-zero status, so the failure neither pollutes JSON written to
        # stdout nor looks like success to the calling shell.
        raise SystemExit('IMPORT ERROR: Jython 2.5 and Bruma.jar '
                         'are required to read .mst files')

    mst = MasterFactory.getInstance(master_file_name).open()
    # try/finally guarantees the master file is closed even when the consumer
    # stops iterating early or a conversion error propagates.
    try:
        for record in mst:
            fields = {}
            if SKIP_INACTIVE:
                if record.getStatus() != Record.Status.ACTIVE:
                    continue
            else:
                # the status is only worth saving when inactive records
                # may appear in the output
                fields[ISIS_ACTIVE_KEY] = (record.getStatus() ==
                                           Record.Status.ACTIVE)
            fields[ISIS_MFN_KEY] = record.getMfn()
            for field in record.getFields():
                field_key = str(field.getId())
                field_occurrences = fields.setdefault(field_key, [])
                if isis_json_type == 3:
                    content = {}
                    for subfield in field.getSubfields():
                        subfield_key = subfield.getId()
                        if subfield_key == '*':
                            content['_'] = subfield.getContent()
                        else:
                            subfield_occurrences = content.setdefault(
                                subfield_key, [])
                            subfield_occurrences.append(subfield.getContent())
                    field_occurrences.append(content)
                elif isis_json_type == 1:
                    content = []
                    for subfield in field.getSubfields():
                        subfield_key = subfield.getId()
                        if subfield_key == '*':
                            # the '*' content always leads the string
                            content.insert(0, subfield.getContent())
                        else:
                            content.append(SUBFIELD_DELIMITER + subfield_key +
                                           subfield.getContent())
                    field_occurrences.append(''.join(content))
                else:
                    raise NotImplementedError(
                        'ISIS-JSON type %s conversion '
                        'not yet implemented for .mst input' % isis_json_type)
            yield fields
    finally:
        mst.close()
|
||||
|
||||
|
||||
def write_json(input_gen, file_name, output, qty, skip, id_tag,  # <3>
               gen_uuid, mongo, mfn, isis_json_type, prefix,
               constant):
    """Write records from ``input_gen`` to ``output`` as JSON.

    Records with index below ``skip`` are read and discarded; at most
    ``qty`` records after that are written. Unless ``mongo`` is true the
    records are wrapped in a JSON array (``[`` ... ``]``) separated by
    commas; with ``mongo`` each record is written alone on its own line.

    Args:
        input_gen: iterable of record dicts (mutated in place before output).
        file_name: unused here; kept so existing callers keep working.
        output: object with a ``write`` method.
        qty: maximum number of records to write.
        skip: number of leading records to ignore.
        id_tag: if truthy, tag whose single occurrence becomes ``_id``.
        gen_uuid: if true (and no id_tag), ``_id`` is a random UUID string.
        mongo: if true, omit the enclosing array and the commas.
        mfn: if true (and neither of the above), ``_id`` is the record MFN.
        isis_json_type: 1, 2 or 3; tells how to read the id occurrence.
        prefix: string prepended to every all-digit key of the record.
        constant: ``'KEY:VALUE'`` added to every record, or ``''``; only
            the first colon separates key from value.

    Raises:
        KeyError: ``id_tag`` is absent from a record.
        TypeError: ``id_tag`` occurs more than once in a record, or its
            value was already used by an earlier record.
        NotImplementedError: ``id_tag`` given with an unsupported type.
        ValueError: ``constant`` contains no colon.
    """
    start = skip
    end = start + qty
    if id_tag:
        id_tag = str(id_tag)
        seen_ids = set()
    else:
        id_tag = ''
    if constant:
        # maxsplit=1 so that values containing ':' (e.g. URLs) are accepted;
        # parsed once here because it does not vary per record
        constant_key, constant_value = constant.split(':', 1)
    if not mongo:
        # opened unconditionally so that empty input or qty == 0 still
        # produces a well-formed "[]" instead of a lone "]"
        output.write('[')
    for i, record in enumerate(input_gen):
        if i >= end:
            break
        if i < start:
            continue
        if not mongo and i > start:
            output.write(',')
        if id_tag:
            occurrences = record.get(id_tag, None)
            if occurrences is None:
                raise KeyError('id tag #%s not found in record %s'
                               % (id_tag, _record_label(i, record)))
            if len(occurrences) > 1:
                raise TypeError('multiple id tags #%s found in record %s'
                                % (id_tag, _record_label(i, record)))
            # ok, we have one and only one id field
            if isis_json_type == 1:
                record_id = occurrences[0]
            elif isis_json_type == 2:
                record_id = occurrences[0][0][1]
            elif isis_json_type == 3:
                record_id = occurrences[0]['_']
            else:
                raise NotImplementedError(
                    'ISIS-JSON type %s not supported for id extraction'
                    % isis_json_type)
            if record_id in seen_ids:
                raise TypeError('duplicate id %s in tag #%s, record %s'
                                % (record_id, id_tag,
                                   _record_label(i, record)))
            record['_id'] = record_id
            seen_ids.add(record_id)
        elif gen_uuid:
            record['_id'] = str(uuid4())
        elif mfn:
            record['_id'] = record[ISIS_MFN_KEY]
        if prefix:
            # iterate over a snapshot of the keys because the loop body
            # adds and deletes keys of the same dict
            for tag in tuple(record):
                if str(tag).isdigit():
                    record[prefix + tag] = record[tag]
                    del record[tag]
        if constant:
            record[constant_key] = constant_value
        output.write(json.dumps(record).encode('utf-8'))
        output.write('\n')
    if not mongo:
        output.write(']\n')


def _record_label(index, record):
    """Return the record index, plus its MFN when the record carries one."""
    if ISIS_MFN_KEY in record:
        return '%s (mfn=%s)' % (index, record[ISIS_MFN_KEY])
    return '%s' % index
|
||||
|
||||
|
||||
def main():  # <4>
    """Parse the command line and convert the input file to JSON.

    Input whose name ends in ``.mst`` (case-insensitive) is read with
    iter_mst_records; anything else is read with iter_iso_records. The
    records are passed to write_json, wrapped in a ``{ "docs" : ... }``
    document when -c/--couch is given. The output file is always closed,
    even if the conversion fails.

    Raises:
        SystemExit: -n/--mfn was requested for non-.mst input (argparse
            itself also exits on invalid arguments).
    """
    # NOTE(review): -c/--couch together with -m/--mongo wraps newline-
    # separated records in "{ "docs" : ... }", which is not valid JSON;
    # the two options look mutually exclusive -- confirm before enforcing.

    # create the parser
    parser = argparse.ArgumentParser(
        description='Convert an ISIS .mst or .iso file to a JSON array')

    # add the arguments
    parser.add_argument(
        'file_name', metavar='INPUT.(mst|iso)',
        help='.mst or .iso file to read')
    parser.add_argument(
        '-o', '--out', type=argparse.FileType('w'), default=sys.stdout,
        metavar='OUTPUT.json',
        help='the file where the JSON output should be written'
             ' (default: write to stdout)')
    parser.add_argument(
        '-c', '--couch', action='store_true',
        help='output array within a "docs" item in a JSON document'
             ' for bulk insert to CouchDB via POST to db/_bulk_docs')
    parser.add_argument(
        '-m', '--mongo', action='store_true',
        help='output individual records as separate JSON dictionaries,'
             ' one per line for bulk insert to MongoDB via mongoimport utility')
    parser.add_argument(
        '-t', '--type', type=int, metavar='ISIS_JSON_TYPE', default=1,
        help='ISIS-JSON type, sets field structure: 1=string, 2=alist, 3=dict (default=1)')
    parser.add_argument(
        '-q', '--qty', type=int, default=DEFAULT_QTY,
        help='maximum quantity of records to read (default=ALL)')
    parser.add_argument(
        '-s', '--skip', type=int, default=0,
        help='records to skip from start of .mst (default=0)')
    parser.add_argument(
        '-i', '--id', type=int, metavar='TAG_NUMBER', default=0,
        help='generate an "_id" from the given unique TAG field number'
             ' for each record')
    parser.add_argument(
        '-u', '--uuid', action='store_true',
        help='generate an "_id" with a random UUID for each record')
    parser.add_argument(
        '-p', '--prefix', type=str, metavar='PREFIX', default='',
        help='concatenate prefix to every numeric field tag (ex. 99 becomes "v99")')
    parser.add_argument(
        '-n', '--mfn', action='store_true',
        help='generate an "_id" from the MFN of each record'
             ' (available only for .mst input)')
    parser.add_argument(
        '-k', '--constant', type=str, metavar='TAG:VALUE', default='',
        help='Include a constant tag:value in every record (ex. -k type:AS)')

    # TODO: add a -r/--repeat option (int, default=1, 0 = until end of
    # input) that repeats the operation saving multiple JSON files, to
    # export large quantities of records to CouchDB

    # parse the command line
    args = parser.parse_args()
    if args.file_name.lower().endswith('.mst'):
        input_gen_func = iter_mst_records  # <5>
    else:
        if args.mfn:
            # a SystemExit message goes to stderr with exit status 1, so the
            # error is not mixed into JSON on stdout nor reported as success
            raise SystemExit('UNSUPPORTED: -n/--mfn option only available'
                             ' for .mst input.')
        input_gen_func = iter_iso_records  # <6>
    input_gen = input_gen_func(args.file_name, args.type)  # <7>
    try:
        if args.couch:
            args.out.write('{ "docs" : ')
        write_json(input_gen, args.file_name, args.out, args.qty,  # <8>
                   args.skip, args.id, args.uuid, args.mongo, args.mfn,
                   args.type, args.prefix, args.constant)
        if args.couch:
            args.out.write('}\n')
    finally:
        # flush and release the output even when the conversion raised
        args.out.close()
|
||||
|
||||
|
||||
# run the converter only when executed as a script, not when imported
if __name__ == '__main__':
    main()
|
||||
# END ISIS2JSON
|
||||
Reference in New Issue
Block a user