Examples

These examples will introduce basic subsetting, formatting, and computation in OpenClimateGIS.

Inspection

First, it is always a good idea to ensure a dataset is readable by OpenClimateGIS. It is also good to check that variables and dimensions are properly identified and match expectations.

from ocgis import RequestDataset
from ocgis.test.base import create_gridxy_global, create_exact_field
from ocgis.variable.crs import Spherical

# Create synthetic data for this example.
grid = create_gridxy_global(resolution=5.0)
field = create_exact_field(grid, 'exact', ntime=31, crs=Spherical())
field.write('ocgis_example_inspect_request_dataset.nc')

# Create the request dataset object.
rd = RequestDataset('ocgis_example_inspect_request_dataset.nc')

# Provides a metadata dump for the request dataset.
rd.inspect()

# This is the auto-discovered data variable name.
assert rd.variable == 'exact'

# The dimension map provides information on how OCGIS will interpret the dataset.
rd.dimension_map.pprint()

Use of snippet

The snippet argument is important when testing and developing an OpenClimateGIS call. It should generally be set to True until the final data request is ready. This is for your benefit (requests are faster) as well as for the benefit of any remote storage server (not transferring excessive data).
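
A minimal sketch of a snippet request (the input path is hypothetical; snippet is the same OcgOperations argument used in the examples below):

import ocgis

# Hypothetical input file used only for illustration.
rd = ocgis.RequestDataset('/path/to/data.nc')
# With snippet=True, only the first time slice is returned, keeping test requests small and fast.
ops = ocgis.OcgOperations(dataset=rd, snippet=True)
ret = ops.execute()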

Simple Subsetting

Warning

The 'csv-shp' output_format is highly recommended over pure ESRI Shapefile or CSV output, which may result in large file sizes: an ESRI Shapefile repeats the geometry storage for each record.

This example will introduce simple subsetting by a bounding box with conversion to in-memory NumPy arrays and to the shapefile, CSV, linked CSV-shapefile, and NetCDF formats.

import os
import tempfile

import ocgis
from ocgis.test.base import create_gridxy_global, create_exact_field

# Name of the variable to subset.
VAR_TAS = 'tas'
# Make it easy to switch to non-snippet requests.
SNIPPET = True
# Set output directory for shapefile and keyed formats. (MAKE SURE IT EXISTS!)
ocgis.env.DIR_OUTPUT = tempfile.mkdtemp()
print(ocgis.env.DIR_OUTPUT)
# The bounding box coordinates [minx, miny, maxx, maxy] for the state of Colorado in WGS84 latitude/longitude
# coordinates.
BBOX = [-109.1, 36.9, -102.0, 41.0]

# Create synthetic data for this example.
grid = create_gridxy_global(resolution=5.0)
field = create_exact_field(grid, VAR_TAS, ntime=31)
data_path = os.path.join(ocgis.env.DIR_OUTPUT, 'ocgis_example_simple_subset.nc')
field.write(data_path)

# This object will be reused so just build it once. Variable names are typically auto-discovered.
rd = ocgis.RequestDataset(data_path, VAR_TAS)

########################################################################################################################
# Returning an OCGIS spatial collection

ret = ocgis.OcgOperations(dataset=rd, geom=BBOX, snippet=SNIPPET).execute()
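# 'ret' is an in-memory spatial collection containing the subset field.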

########################################################################################################################
# Returning conversions

output_formats = ['shp', 'csv', 'csv-shp', 'nc']
for output_format in output_formats:
    prefix = output_format + '_output'
    ops = ocgis.OcgOperations(dataset=rd, geom=BBOX, snippet=SNIPPET, output_format=output_format, prefix=prefix)
    ret = ops.execute()

print('simple_subset Example Finished')

Now, the directory structure for the temporary directory will look like:

$ tree /tmp/tmpO900zS
/tmp/tmpO900zS
├── csv_output
│   ├── csv_output.csv
│   ├── csv_output_did.csv
│   ├── csv_output_metadata.txt
│   └── csv_output_source_metadata.txt
├── csv-shp_output
│   ├── csv-shp_output.csv
│   ├── csv-shp_output_did.csv
│   ├── csv-shp_output_metadata.txt
│   ├── csv-shp_output_source_metadata.txt
│   └── shp
│       ├── csv-shp_output_gid.cpg
│       ├── csv-shp_output_gid.dbf
│       ├── csv-shp_output_gid.prj
│       ├── csv-shp_output_gid.shp
│       ├── csv-shp_output_gid.shx
│       ├── csv-shp_output_ugid.cpg
│       ├── csv-shp_output_ugid.dbf
│       ├── csv-shp_output_ugid.prj
│       ├── csv-shp_output_ugid.shp
│       └── csv-shp_output_ugid.shx
├── nc_output
│   ├── nc_output_did.csv
│   ├── nc_output_metadata.txt
│   ├── nc_output.nc
│   └── nc_output_source_metadata.txt
├── ocgis_example_simple_subset.nc
└── shp_output
    ├── shp_output.cpg
    ├── shp_output.dbf
    ├── shp_output_did.csv
    ├── shp_output_metadata.txt
    ├── shp_output.prj
    ├── shp_output.shp
    ├── shp_output.shx
    ├── shp_output_source_metadata.txt
    ├── shp_output_ugid.cpg
    ├── shp_output_ugid.dbf
    ├── shp_output_ugid.prj
    ├── shp_output_ugid.shp
    └── shp_output_ugid.shx

5 directories, 36 files

Advanced Subsetting

In this example, a U.S. state boundary shapefile will be used to subset and aggregate three example climate datasets. The aggregation will occur on a per-geometry + dataset basis. Hence, we will end up with daily aggregated "statewide" temperatures for the three climate variables. We also want to clip the climate data cells to the boundary of the selection geometry to take advantage of area-weighting and avoid double-counting grid cells that overlap multiple geometries.

import os

import ocgis
from ocgis import OcgOperations, RequestDataset, env
from ocgis.test.base import create_gridxy_global, create_exact_field

# Only return the first time slice.
SNIPPET = True
# Data returns will overwrite in this case. Use with caution!!
env.OVERWRITE = True
# This is where to find the shapefiles.
ocgis.env.DIR_GEOMCABINET = os.path.join(os.path.split(ocgis.test.__file__)[0], 'bin')

########################################################################################################################
# Write example datasets for use in this example.

grid = create_gridxy_global(resolution=3.0)
# Use 'var_names' to avoid shadowing the built-in vars().
var_names = ['ocgis_example_tasmin', 'ocgis_example_tas', 'ocgis_example_tasmax']
paths = ['{}.nc'.format(ii) for ii in var_names]
field_names = ['tasmin', 'tas', 'tasmax']
for idx, (path, var) in enumerate(zip(paths, var_names)):
    field = create_exact_field(grid.copy(), var, ntime=3)
    field.data_variables[0].get_value()[:] = idx + 1
    field.write(path)
# for path in paths:
#     RequestDataset(path).inspect()

########################################################################################################################
# Create the request dataset objects for the file paths we'd like to subset. Variables will often be auto-discovered and
# default field names will be created. We'll just pass in everything here.

rds = [RequestDataset(uri=uri, variable=var, field_name=field_name) for uri, var, field_name in
       zip(paths, var_names, field_names)]

########################################################################################################################
# Return in-memory as an OCGIS collection for a single geometry.

# Return a SpatialCollection, but only for a target state in a U.S. state boundaries shapefile. In this case, the UGID
# attribute value of 16 is associated with Nebraska.
print('Returning OCGIS format for a state...')
ops = OcgOperations(dataset=rds, spatial_operation='clip', aggregate=True, snippet=SNIPPET, geom='state_boundaries',
                    geom_select_uid=[16])
ret = ops.execute()

# Find the geometry field in the returned collection.
nebraska = ret.children[16]
# Or...
nebraska = ret.groups[16]
# Assert some things about the geometry field.
assert nebraska['STATE_NAME'].get_value()[0] == 'Nebraska'
assert nebraska.geom.geom_type == 'Polygon'

# The Nebraska subset geometry field contains the subsetted and aggregated data for each request dataset. Note how we
# overloaded the field names.
assert list(nebraska.children.keys()) == field_names

# Get the actual aggregated data value. With only one subset geometry, "container_ugid" is not really necessary. By
# default, the first field will be returned without the container identifier. Check the values are what we expect.
for idx, (field_name, var_name) in enumerate(zip(field_names, var_names)):
    aggregated_variable = ret.get_element(container_ugid=16, field_name=field_name, variable_name=var_name)
    assert aggregated_variable.get_value().mean() == idx + 1

########################################################################################################################
# Return data as the default OCGIS spatial collection output for all state boundaries.

print('Returning OCGIS format for all states...')
ops = OcgOperations(dataset=rds, spatial_operation='clip', aggregate=True, snippet=SNIPPET, geom='state_boundaries')
ret = ops.execute()
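# The example state boundaries shapefile contains 51 records: 50 states plus the District of Columbia.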
assert len(ret.geoms) == 51

########################################################################################################################
# Write to shapefile

print('Creating ESRI Shapefile...')
ops = OcgOperations(dataset=rds, spatial_operation='clip', aggregate=True, snippet=SNIPPET, geom='state_boundaries',
                    output_format='shp')
path = ops.execute()
assert os.path.exists(path)

########################################################################################################################
# Write to linked CSV and ESRI Shapefile

# Without the snippet, we are writing all data to the linked CSV-Shapefile output format. The operation will take 
# considerably longer.
print('Creating linked CSV and ESRI Shapefile...')
ops = OcgOperations(dataset=rds, spatial_operation='clip', aggregate=True, snippet=False, geom='state_boundaries',
                    output_format='csv-shp')
path = ops.execute()
assert os.path.exists(path)

Subsetting with a Time/Level Range

Adding a time or level range subset is done at the RequestDataset level.

Warning

Datetime ranges are absolute and inclusive.

For example, the code below will return all data from the year 2000 for the first two levels. Level indexing originates at 1.

>>> from ocgis import OcgOperations, RequestDataset
>>> import datetime
>>> # Depending on your data's time resolution, the hour/minute/second/etc. may be important for capturing all the data
>>> # within the range.
>>> tr = [datetime.datetime(2000, 1, 1), datetime.datetime(2000, 12, 31, 23, 59, 59)]
>>> rd = RequestDataset('/my/leveled/data', 'tas', time_range=tr, level_range=[1, 2])
>>> ret = OcgOperations(dataset=rd).execute()

Configuring a Dimension Map

Dimension maps (DimensionMap) are used to overload default metadata interpretations. If provided to a request (RequestDataset) or field (Field), the dimension map will be used to interpret a variable collection when converting the collection to a field.

This example shows how to pass a dimension map to a request when working with non-standard metadata:

import os

import ocgis
from ocgis.constants import DimensionMapKey

OUT_NC = os.path.join(os.getcwd(), 'ocgis_example_dimension_map.nc')

########################################################################################################################
# Write some initial data.

var_x = ocgis.Variable(name='nonstandard_x', value=[10, 20, 30], dimensions='nsx')
var_y = ocgis.Variable(name='nonstandard_y', value=[40, 50, 60, 70], dimensions='nsy')
var_t = ocgis.Variable(name='nonstandard_time', value=[1, 2, 3], units='days since 2000-1-1',
                       attrs={'calendar': 'standard'}, dimensions='nst')
data_dimensions = [var_t.dimensions[0], var_x.dimensions[0], var_y.dimensions[0]]
var_data = ocgis.Variable(name='some_data', dimensions=data_dimensions)

vc = ocgis.VariableCollection(variables=[var_x, var_y, var_t, var_data])
vc.write(OUT_NC)

########################################################################################################################
# This metadata is not self-describing. Hence, no data or coordinate variables are interpretable. OpenClimateGIS will
# use standard names for coordinate variables, but these names are not standard!

rd = ocgis.RequestDataset(OUT_NC)

try:
    assert rd.variable
except ocgis.exc.NoDataVariablesFound:
    pass

########################################################################################################################
# Construct the dimension map as an object.

dmap = ocgis.DimensionMap()
dmap.set_variable(DimensionMapKey.TIME, var_t)
dmap.set_variable(DimensionMapKey.X, var_x)
dmap.set_variable(DimensionMapKey.Y, var_y)

rd = ocgis.RequestDataset(OUT_NC, dimension_map=dmap)
assert rd.variable == var_data.name

########################################################################################################################
# Construct the dimension map using a dictionary.

dmap = {'time': {'dimension': ['nst'],
                 'variable': 'nonstandard_time'},
        'x': {'dimension': ['nsx'],
              'variable': 'nonstandard_x'},
        'y': {'dimension': ['nsy'],
              'variable': 'nonstandard_y'}}

rd = ocgis.RequestDataset(OUT_NC, dimension_map=dmap)
assert rd.variable == var_data.name

Using the Data Interface in Parallel

Standard operations scripts may be run in parallel using mpirun with no special code. See Explanatory Parallel Example for an advanced example using OpenClimateGIS’s data interface in parallel.
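
For example, a call such as the following runs a standard operations script on eight processes (the script name is hypothetical):

$ mpirun -n 8 python ocgis_script.py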

Regridding
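
Regrid a coarse global dataset onto a finer downscaled grid, first with conservative regridding and then with bilinear interpolation (i.e. without corners).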

import os
import tempfile

import ESMF

import ocgis

# global model grid with ~3 degree resolution
URI1 = os.path.expanduser('~/climate_data/CanCM4/tas_day_CanCM4_decadal2000_r2i1p1_20010101-20101231.nc')
# downscaled model grid covering the conterminous United States with ~1/8 degree resolution
URI2 = os.path.expanduser('~/climate_data/maurer/bcca/obs/tasmax/1_8deg/gridded_obs.tasmax.OBS_125deg.daily.1991.nc')
ocgis.env.DIR_OUTPUT = tempfile.gettempdir()

########################################################################################################################
# Simple regridding example with a bounding box subset writing to netCDF using conservative regridding

bbox = [-104, 36, -95, 44]
# Regrid the global dataset to the downscaled grid.
rd_global = ocgis.RequestDataset(uri=URI1)
rd_downscaled = ocgis.RequestDataset(uri=URI2)
ops = ocgis.OcgOperations(dataset=rd_global, regrid_destination=rd_downscaled, geom=bbox, output_format='nc',
                          prefix='with_corners')
ret = ops.execute()

########################################################################################################################
# Regrid using bilinear interpolation (i.e. without corners)

rd_global = ocgis.RequestDataset(uri=URI1)
rd_downscaled = ocgis.RequestDataset(uri=URI2)
regrid_options = {'regrid_method': ESMF.RegridMethod.BILINEAR}
ops = ocgis.OcgOperations(dataset=rd_global, regrid_destination=rd_downscaled, geom=bbox, output_format='nc',
                          regrid_options=regrid_options, prefix='without_corners')
ret = ops.execute()

Calculating TG90p using icclim

TG90p is a European Climate Assessment (ECA) climate index.

from ocgis import RequestDataset, OcgOperations
from ocgis.contrib.library_icclim import IcclimTG90p

########################################################################################################################
# Compute a custom percentile basis using ICCLIM.

# Path to a CF climate dataset. This example uses the same file for the index and percentile basis calculations.
in_file = '/path/to/cf_data.nc'

# Subset the input dataset to return the desired base period for the percentile basis.
variable = 'tas'
years = range(2001, 2003)  # A custom date range may be required for your data
time_region = {'year': years}
rd = RequestDataset(uri=in_file, variable=variable)
field = rd.create_field()
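# get_time_region returns the subset time variable; its 'parent' is the containing field, also subset.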
field = field.time.get_time_region(time_region).parent

# Calculate the percentile basis. The data values must be a three-dimensional array.
arr = field[variable].get_masked_value().squeeze()  # This is the field data to use for the calculation
dt_arr = field.temporal.value_datetime  # This is an array of datetime objects.
percentile = 90
window_width = 5
t_calendar, t_units = field.time.calendar, field.time.units  # ICCLIM requires calendar and units for the calculation
percentile_dict = IcclimTG90p.get_percentile_dict(arr, dt_arr, percentile, window_width, t_calendar, t_units)

########################################################################################################################
# Calculate indice using custom percentile basis.

# Depending on the size of the data, this computation may take some time...
calc = [{'func': 'icclim_TG90p', 'name': 'TG90p', 'kwds': {'percentile_dict': percentile_dict}}]
calc_grouping = 'month'
# Returns data as an in-memory spatial collection
ops = OcgOperations(dataset=rd, calc=calc, calc_grouping=calc_grouping)
coll = ops.execute()

Stack Subsetted Data From Spatial Collections

Stack (concatenate) subsetted data from multiple files using the unlimited time dimension.

from ocgis import OcgOperations
from ocgis.test import create_gridxy_global, create_exact_field
from ocgis.variable import stack

# Create data files using the CF-Grid metadata convention with three timesteps per file ################################

filenames = []
for ii in range(1, 4):
    grid = create_gridxy_global()
    field = create_exact_field(grid, 'data', ntime=3)
    field.time.v()[:] += 10 * ii
    field['data'].v()[:] += 10 * ii
    currfn = 'ocgis_example_stacking_subsets_{}.nc'.format(ii)
    filenames.append(currfn)
    field.write(currfn)

########################################################################################################################

# Subset each file created above using a bounding box and return the data as a spatial collection.
colls = [OcgOperations(dataset={'uri': fn}, geom=[40, 30, 50, 60]).execute() for fn in filenames]
# Extract the fields to stack from each spatial collection.
colls_to_stack = [coll.get_element() for coll in colls]
# Stack the data along the time dimension returning a field.
stacked = stack(colls_to_stack, 'time')
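
As a quick sanity check, the stacked field should contain all timesteps from the three inputs (an illustrative assertion; the bounding box subset does not reduce the time dimension):

# Three files with three timesteps each yield nine stacked timesteps.
assert stacked.time.shape[0] == 9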

Convert Nonstandard Units

Convert units not supported by the cfunits conversion library.

from ocgis.test import create_gridxy_global, create_exact_field

# Create an in-memory field.
grid = create_gridxy_global()
field = create_exact_field(grid, 'a_var')

# Set the test variable's units.
field['a_var'].units = 'odd_source_units'

# Another way to set the source units...
# field['a_var'].attrs['units'] = 'odd_source_units'

# These calls retrieve the underlying data values without a mask.
# value = field['a_var'].v()
# value = field['a_var'].get_value()

# It is best to work with the masked values unless performance is an issue.
masked_value = field['a_var'].mv()
# masked_value = field['a_var'].get_masked_value()

# Convert the units in-place (the factor is an arbitrary example) and update the units attribute on the target variable.
masked_value[:] *= 1.75
field['a_var'].units = 'odd_destination_units'