Source code for pygeoapi.provider.geojson

# =================================================================
#
# Authors: Matthew Perry <perrygeo@gmail.com>
#
# Copyright (c) 2018 Matthew Perry
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

import json
import logging
import os
import uuid

from pygeoapi.provider.base import (BaseProvider, ProviderItemNotFoundError,
                                    ProviderSchemaError,
                                    ProviderItemAlreadyExistsError)

LOGGER = logging.getLogger(__name__)


class GeoJSONProvider(BaseProvider):
    """
    Provider class backed by local GeoJSON files

    This is meant to be simple
    (no external services, no dependencies, no schema)
    at the expense of performance
    (no indexing, full serialization roundtrip on each request)

    Not thread safe, a single server process is assumed

    This implementation uses the feature 'id' heavily
    and will override any 'id' provided in the original data.
    The feature 'properties' will be preserved.

    TODO:
    * query method should take bbox
    * instead of methods returning FeatureCollections,
      we should be yielding Features and aggregating in the view
    * there are strict id semantics; every feature in the input GeoJSON
      file must have an id, and the ids must be unique strings.
      Otherwise it will break.
    * How to raise errors in the provider implementation such that
      appropriate HTTP responses will be raised
    """

    def __init__(self, provider_def):
        """
        Initialize the provider

        :param provider_def: provider definition, handed unchanged to the
                             BaseProvider initializer
        """

        super().__init__(provider_def)
        # keep the field listing derived from the data file on the instance
        self.fields = self.get_fields()

[docs]    def get_fields(self):
        """
         Get provider field information (names, types)

        :returns: dict of fields
        """

        LOGGER.debug('Treating all columns as string types')
        if os.path.exists(self.data):
            with open(self.data) as src:
                data = json.loads(src.read())
            fields = {}
            for f in data['features'][0]['properties'].keys():
                fields[f] = 'string'
            return fields

    def get_all_fields(self, dict):
        fields = set()
        for f in dict:
            fields = fields.union(set(f['properties'].keys()))
        return fields

[docs]    def _load(self):
        """Load and validate the source GeoJSON file
        at self.data

        Yes loading from disk, deserializing and validation
        happens on every request. This is not efficient.
        """

        if os.path.exists(self.data):
            with open(self.data) as src:
                data = json.loads(src.read())
        else:
            data = {
                'type': 'FeatureCollection',
                'features': []}
        # Must be a FeatureCollection
        assert data['type'] == 'FeatureCollection'
        # All features must have ids, TODO must be unique strings
        for i in data['features']:
            if 'id' not in i and self.id_field in i['properties']:
                i['id'] = i['properties'][self.id_field]
        return data

[docs]    def _load_without_null(self):
        """Load and validate the source GeoJSON file
        at self.data with None values abscent

        Yes loading from disk, deserializing and validation
        happens on every request. This is not efficient.
        """

        data = self._load()
        for feature in data['features']:
            for prop in feature['properties']:
                if feature['properties'][prop] is None:
                    feature['properties'].pop(prop)
        return data

[docs]    def query(self, startindex=0, limit=10, resulttype='results',
              bbox=[], datetime=None, properties=[], sortby=[]):
        """
        query the provider

        :param startindex: starting record to return (default 0)
        :param limit: number of records to return (default 10)
        :param resulttype: return results or hit limit (default results)
        :param bbox: bounding box [minx,miny,maxx,maxy]
        :param datetime: temporal (datestamp or extent)
        :param properties: list of tuples (name, value)
        :param sortby: list of dicts (property, order)

        :returns: FeatureCollection dict of 0..n GeoJSON features
        """

        # TODO filter by bbox without resorting to third-party libs
        data = self._load()
        data['numberMatched'] = len(data['features'])
        if resulttype == 'hits':
            data['features'] = []
        else:
            data['features'] = data['features'][startindex:startindex+limit]
            data['numberReturned'] = len(data['features'])
        return data

    def generate_unique_id(self):
        feats = self._load()['features']
        samp_id_type = type(feats[0].get('id'))
        if isinstance(samp_id_type, int):
            ids = set([feat.get('id', None) or
                       feat['properties'].get(self.id_field)
                       for feat in feats])
            id = 0
            while True:
                if id not in ids:
                    return id
                id = id + 1
        if isinstance(samp_id_type, str):
            return str(uuid.uuid4())

[docs]    def get(self, identifier):
        """
        query the provider by id

        :param identifier: feature id

        :returns: dict of single GeoJSON feature
        """

        all_data = self._load()
        samp_feat = all_data['features'][0]
        id_type = type(samp_feat['id'])
        for feature in all_data['features']:
            if feature['id'] == id_type(identifier):
                return feature
        # default, no match
        err = 'item {} not found'.format(identifier)
        LOGGER.error(err)
        raise ProviderItemNotFoundError(err)

[docs]    def create(self, new_feature):
        """
        create a new feature item

        :param new_feature: new GeoJSON feature dictionary

        :returns: feature id
        """

        all_data = self._load()
        samp_feat = all_data['features'][0]
        id_field = self.id_field
        nfid = new_feature.get('id', None) or\
            new_feature['properties'].get(id_field, None)

        if nfid is not None:
            for feature in all_data['features']:
                if feature['id'] == nfid:
                    err = 'provider item {} already exists'\
                                .format(nfid)
                    LOGGER.error(err)
                    raise ProviderItemAlreadyExistsError(err)
        else:
            nfid = self.generate_unique_id()

        curr_cols = self.get_all_fields(all_data['features']) - {id_field}
        new_cols = set(new_feature['properties'].keys()) - {id_field}
        # if given data has extra properties not in schema
        if bool(new_cols - curr_cols):
            err = 'properties {} not prescent in provider schema'\
                .format(new_cols - curr_cols)
            LOGGER.error(err)
            raise ProviderSchemaError(err)

        # set id field as per schema in file
        if id_field in samp_feat['properties']:
            new_feature['properties'][id_field] = nfid
        else:
            new_feature['id'] = nfid
        # set missing properties to empty
        for prop in curr_cols - new_cols:
            new_feature['properties'][prop] = None

        all_data['features'].append(new_feature)
        with open(self.data, 'w') as dst:
            dst.write(json.dumps(all_data, indent=2, sort_keys=True))
        return nfid

[docs]    def replace(self, identifier, new_feature):
        """
        replace an existing feature item with new_feature item

        :param identifier: feature id
        :param new_feature: new GeoJSON feature dictionary
        """

        all_data = self._load()
        id_field = self.id_field
        samp_feat = all_data['features'][0]
        id_type = type(samp_feat['id'])

        # flag if id is already prescent in collection
        found_feature = False
        for index, feature in enumerate(all_data['features']):
            if feature['id'] == id_type(identifier):
                found_feature = True
                break

        # id is abscent in collection
        if not found_feature:
            err = 'item {} not found'.format(identifier)
            LOGGER.error(err)
            raise ProviderItemNotFoundError(err)

        # if given data has extra properties not in schema
        curr_cols = self.get_all_fields(self._load()['features']) - {id_field}
        new_cols = set(new_feature['properties'].keys()) - {id_field}
        if bool(new_cols - curr_cols):
            err = 'properties {} not prescent in provider schema'\
                .format(new_cols - curr_cols)
            LOGGER.error(err)
            raise ProviderSchemaError(err)

        # set id field
        if id_field in samp_feat['properties']:
            new_feature['properties'][id_field] = feature['id']
        else:
            new_feature['id'] = feature['id']

        # set missing properties to empty
        for prop in curr_cols - new_cols:
            new_feature['properties'][prop] = None
        all_data['features'][index] = new_feature
        # clean up empty attributes
        remove_set = set()
        for attrib in curr_cols - new_cols:
            empt = True
            for feature in all_data['features']:
                if feature['properties'][attrib] is not None:
                    empt = False
                    break
            if empt:
                remove_set.add(attrib)
        for attrib in remove_set:
            for feature in all_data['features']:
                feature['properties'].pop(attrib)

        with open(self.data, 'w') as dst:
            dst.write(json.dumps(all_data, indent=2, sort_keys=True))

[docs]    def update(self, identifier, updates):
        """
        update an existing feature item

        :param identifier: feature id
        :param updates: updates dictionary

        :returns: feature item
        """

        id_field = self.id_field

        all_data = self._load()
        samp_feat = all_data['features'][0]
        id_type = type(samp_feat['id'])

        curr_cols = self.get_all_fields(all_data['features']) - {id_field}

        found_feature = False
        for index, feature in enumerate(all_data['features']):
            if feature['id'] == id_type(identifier):
                found_feature = True
                break

        if not found_feature:
            err = 'item {} not found'.format(identifier)
            LOGGER.error(err)
            raise ProviderItemNotFoundError(err)
        else:
            # add an attribute if its not already prescent in the feature
            if 'add' in updates:
                for name_val_pair in updates['add']:
                    name = name_val_pair['name']
                    value = name_val_pair['value']
                    if name not in curr_cols:
                        for f in all_data['features']:
                            f['properties'][name] = None
                        feature['properties'][name] = value
                    else:
                        err = 'property {} exists in given provider item'\
                            .format(name)
                        LOGGER.error(err)
                        raise ProviderSchemaError(err)

            # modify an attribute if its  already prescent in the feature
            if 'modify' in updates:
                for name_val_pair in updates['modify']:
                    name = name_val_pair['name']
                    value = name_val_pair['value']
                    if name in self.get_all_fields(all_data['features']):
                        feature['properties'][name] = value
                    else:
                        err = 'property {} dont exist in given provider item'\
                            .format(name)
                        raise ProviderSchemaError(err)

            # delete an attribute if its prescent in the feature
            if 'remove' in updates:
                for name in updates['remove']:
                    if name in curr_cols and \
                       feature['properties'][name] is not None:
                        feature['properties'][name] = None
                        empt = True
                        for f in all_data['features']:
                            if f['properties'][name] is not None:
                                empt = False
                                break
                        if empt:
                            for f in all_data['features']:
                                f['properties'].pop(name)
                    else:
                        err = 'property {} doesnt exists for given \
                               provider item'.format(name)
                        raise ProviderSchemaError(err)

            all_data['features'][index] = feature

            # clean up empty attributes
            curr_cols = self.get_all_fields(all_data['features']) - {id_field}
            remove_set = set()
            for attrib in curr_cols:
                empt = True
                for feature in all_data['features']:
                    if feature['properties'][attrib] is not None:
                        empt = False
                        break
                if empt:
                    remove_set.add(attrib)
            for attrib in remove_set:
                for feature in all_data['features']:
                    feature['properties'].pop(attrib)

            with open(self.data, 'w') as dst:
                dst.write(json.dumps(all_data, indent=2, sort_keys=True))

            feature = all_data['features'][index]
            return feature

[docs]    def delete(self, identifier):
        """
        deletes an existing feature item

        :param identifier: feature id
        """

        id_field = self.id_field
        all_data = self._load()
        samp_feat = all_data['features'][0]
        id_type = type(samp_feat['id'])

        found_feature = False
        for index, feature in enumerate(all_data['features']):
            if feature['id'] == id_type(identifier):
                found_feature = True
                break

        if not found_feature:
            err = 'item {} not found'.format(identifier)
            LOGGER.error(err)
            raise ProviderItemNotFoundError(err)

        all_data['features'].pop(index)

        # clean up empty attributes
        curr_cols = self.get_all_fields(all_data['features']) - {id_field}
        remove_set = set()
        for attrib in curr_cols:
            empt = True
            for feature in all_data['features']:
                if feature['properties'][attrib] is not None:
                    empt = False
                    break
            if empt:
                remove_set.add(attrib)
        for attrib in remove_set:
            for feature in all_data['features']:
                feature['properties'].pop(attrib)

        with open(self.data, 'w') as dst:
            dst.write(json.dumps(all_data, indent=2, sort_keys=True))

    def __repr__(self):
        return '<GeoJSONProvider> {}'.format(self.data)