""" Handle chunks/chunksize related logic 

 

Chunks vs chunksizes:: 

 

"Chunks" refers to a collection of chunk sizes organized by dimension 

 

* e.g., ``{'time': (3, 3, 3, 1, )}`` 

* For :py:class:`dask.array.Array` and :py:class:`xarray.DataArray`, 

``.chunks`` is a tuple 

* :py:class:`xarray.Dataset` ``.chunks`` is a mapping 

 

"Chunksizes" refers to a scalar size (an integer) organized by dimension 

 

* e.g., ``{'time': 3}`` 

* ``chunksizes`` is used in encoding for NetCDF4 xarray backend 

 

""" 

from collections import Counter, OrderedDict, defaultdict 

from functools import singledispatch 

import logging 

from pathlib import Path 

 

import xarray as xr 

 

from ..utils import register_multi_singledispatch 

 

logger = logging.getLogger(__name__) 
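
# The helper below is not part of the original module; it is a minimal,
# hypothetical sketch illustrating the "chunks" vs "chunksizes" distinction
# described in the module docstring. It assumes numpy and dask are installed.
def _example_chunks_vs_chunksizes():  # pragma: no cover
    import numpy as np

    da = xr.DataArray(np.zeros(10), dims=('time', )).chunk({'time': 3})
    # "Chunks": every chunk size along each dimension (a tuple per dimension)
    assert da.chunks == ((3, 3, 3, 1), )
    # "Chunksizes": one integer per dimension, e.g. as used in NetCDF encoding
    chunksizes = {'time': 3}
    assert {d: c[0] for d, c in zip(da.dims, da.chunks)} == chunksizes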

 

 

# ----------------------------------------------------------------------------
# Read chunks from files
def read_chunks(filename, variables=None):
    """ Return chunks associated with each variable if possible

    Parameters
    ----------
    filename : str
        Read chunks from this file
    variables : Sequence
        Subset of variables to retrieve ``chunking`` for

    Returns
    -------
    Mapping[str, Mapping[str, int]]
        Mapping of variable names to chunks. Chunks are stored as a mapping
        of dimension name to chunksize (e.g., ``{'x': 250}``)

    Raises
    ------
    ValueError
        Raised if no chunks can be determined (unknown file format, etc.)
    """
    read_funcs = (read_chunks_netcdf4, )
    for func in read_funcs:
        try:
            var_chunks = func(filename, variables=variables)
        except Exception:
            logger.debug(f'Could not determine chunks for "{filename}" '
                         f'using "{func.__name__}"', exc_info=True)
        else:
            return var_chunks
    raise ValueError(f'Could not determine chunks for "{filename}"')
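
# Hypothetical usage sketch (not part of the original module): write a tiny
# chunked NetCDF file to a temporary directory and read its chunking back.
# The file, variable, and chunk values are made up for illustration and the
# netCDF4 library is assumed to be installed.
def _example_read_chunks():  # pragma: no cover
    import tempfile

    import numpy as np

    with tempfile.TemporaryDirectory() as tmpdir:
        path = str(Path(tmpdir) / 'example.nc')
        ds = xr.Dataset({'blu': (('y', 'x'), np.zeros((10, 10)))})
        ds.to_netcdf(path, engine='netcdf4',
                     encoding={'blu': {'chunksizes': (5, 5)}})
        # Expected result: {'blu': {'y': 5, 'x': 5}}
        return read_chunks(path, variables=['blu'])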

 

 

def read_chunks_netcdf4(filename, variables=None):
    """ Return chunks associated with each variable

    Parameters
    ----------
    filename : str
        Filename of NetCDF file
    variables : Sequence
        Subset of variables to retrieve ``chunking`` for

    Returns
    -------
    Mapping[str, Mapping[str, int]]
        Mapping of variable names to chunks. Chunks are stored as a mapping
        of dimension name to chunksize (e.g., ``{'x': 250}``)
    """
    # Keep this import inside in case the user doesn't have the library
    # (e.g., with a minimal install of xarray)
    from netCDF4 import Dataset

    logger.debug(f'Opening "{filename}" as a `netCDF4.Dataset` to read '
                 'saved chunksizes')
    with Dataset(filename, mode='r') as nc:
        # Store chunking info for each variable
        chunks = OrderedDict()
        variables = variables or nc.variables.keys()
        for name in variables:
            var = nc.variables[name]
            dims = var.dimensions
            chunking = var.chunking()
            if isinstance(chunking, list):
                chunking = OrderedDict(
                    (_dim, _chunk) for _dim, _chunk in zip(dims, chunking)
                    if not _dim.startswith('string')
                )
            else:
                chunking = None
            chunks[name] = chunking

    return chunks

 

 

def read_chunks_rasterio(riods):
    """ Return chunks for a rasterio dataset, formatted for xarray

    Parameters
    ----------
    riods : str, pathlib.Path, or rasterio.DatasetReader
        Rasterio dataset or path to dataset

    Returns
    -------
    dict
        Chunks as expected by xarray (e.g., ``{'x': 50, 'y': 50}``)
    """
    if isinstance(riods, (str, Path, )):
        # Keep this import inside in case the user doesn't have the library
        # (e.g., with a minimal install of xarray)
        import rasterio
        # Open it for ourselves
        with rasterio.open(str(riods), 'r') as riods:
            return read_chunks_rasterio(riods)

    chunks = riods.block_shapes
    if len(set(chunks)) != 1:
        warnings.warn('Block shapes are inconsistent across bands. '
                      'Using block shapes from the first band')
    chunks = dict(y=chunks[0][0], x=chunks[0][1])

    return chunks
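
# Hypothetical usage sketch (not part of the original module): create a tiny
# tiled GeoTIFF and read its block shapes back as xarray-style chunks. The
# file name and block sizes are made up, and rasterio is assumed installed.
def _example_read_chunks_rasterio():  # pragma: no cover
    import tempfile

    import numpy as np
    import rasterio

    with tempfile.TemporaryDirectory() as tmpdir:
        path = str(Path(tmpdir) / 'example.tif')
        profile = dict(driver='GTiff', width=64, height=64, count=1,
                       dtype='uint8', tiled=True,
                       blockxsize=32, blockysize=32)
        with rasterio.open(path, 'w', **profile) as dst:
            dst.write(np.zeros((1, 64, 64), dtype='uint8'))
        # Expected result: {'y': 32, 'x': 32}
        return read_chunks_rasterio(path)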

 

 

# ----------------------------------------------------------------------------
# Chunk heuristics
def best_chunksizes(chunks, tiebreaker=max):
    """Decide which chunksize to use for each dimension from variables

    Parameters
    ----------
    chunks : Mapping[str, Mapping[str, int]]
        Mapping of variable names to variable chunksizes
    tiebreaker : callable, optional
        Controls which chunksize should be used for a dimension in the event
        of a tie. For example, if 3 variables had a chunksize of 250 and
        another 3 had a chunksize of 500, the guess is determined by
        ``tiebreaker([250, 500])``. By default, prefer the larger chunksize
        (i.e., :py:func:`max`)

    Returns
    -------
    dict
        Chunksize per dimension

    Examples
    --------

    >>> chunks = {
    ...     'blu': {'x': 5, 'y': 5},
    ...     'grn': {'x': 5, 'y': 10},
    ...     'red': {'x': 5, 'y': 5},
    ...     'ordinal': None
    ... }
    >>> best_chunksizes(chunks)
    {'x': 5, 'y': 5}

    """
    # Collect all chunksizes as {dim: [chunksizes, ...]}
    dim_chunks = defaultdict(list)
    for var, var_chunks in chunks.items():
        if var_chunks:
            # Works whether values are chunks (tuples) or chunksizes (ints)
            dims = list(var_chunks.keys())
            chunksizes = chunks_to_chunksizes(var_chunks)
            for dim, chunksize in zip(dims, chunksizes):
                dim_chunks[dim].append(chunksize)

    guess = {}
    for dim, chunksizes in dim_chunks.items():
        # Use the most frequently occurring chunksize
        counter = Counter(chunksizes)
        max_n = max(counter.values())
        max_val = tuple(k for k, v in counter.items() if v == max_n)

        # If multiple, prefer the biggest value (by default)
        if len(max_val) > 1:
            logger.debug('Multiple common chunksizes found. Breaking tie '
                         f'using `{tiebreaker}`')
            pick = tiebreaker(max_val)
        else:
            pick = max_val[0]

        logger.debug(f'Guessing value "{pick}" for dim "{dim}"')
        guess[dim] = pick

    return guess
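
# A small, hypothetical sketch (not in the original module) showing how the
# ``tiebreaker`` argument of ``best_chunksizes`` resolves ties between
# equally common chunksizes. Variable names and sizes are made up.
def _example_best_chunksizes_tiebreaker():  # pragma: no cover
    chunks = {
        'blu': {'x': 250},
        'grn': {'x': 250},
        'red': {'x': 500},
        'nir': {'x': 500},
    }
    # Two variables use 250 and two use 500: the default prefers the larger
    assert best_chunksizes(chunks) == {'x': 500}
    # ... but the tie can be broken the other way
    assert best_chunksizes(chunks, tiebreaker=min) == {'x': 250}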

 

 

def auto_determine_chunks(filename):
    """ Try to guess the best chunksizes for a filename

    Parameters
    ----------
    filename : str
        File to read

    Returns
    -------
    dict
        Best guess for chunksizes to use for each dimension
    """
    try:
        var_chunks = read_chunks(str(filename))
    except ValueError:
        logger.debug('"auto" chunk determination failed')
        chunks = None
    else:
        chunks = best_chunksizes(var_chunks)

    return chunks
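
# Hypothetical sketch (not in the original module): ``auto_determine_chunks``
# falls back to ``None`` instead of raising when chunks cannot be read.
# The file name below is made up and does not need to exist.
def _example_auto_determine_chunks():  # pragma: no cover
    assert auto_determine_chunks('no_such_file.xyz') is None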

 

 

# ----------------------------------------------------------------------------
# Chunk format handling
@singledispatch
def get_chunksizes(xarr):
    """ Return the chunk sizes used for each dimension in ``xarr``

    Parameters
    ----------
    xarr : xr.DataArray or xr.Dataset
        Chunked data

    Returns
    -------
    dict
        Dimensions (keys) and chunk sizes (values)

    Raises
    ------
    TypeError
        Raised if input is not a Dataset or DataArray
    """
    raise TypeError('Input `xarr` must be an xarray Dataset or DataArray, '
                    f'not "{type(xarr)}"')

 

 

@get_chunksizes.register(xr.DataArray)
def _get_chunksizes_dataarray(xarr):
    if not xarr.chunks:
        return {}
    return OrderedDict(
        (dim, xarr.chunks[i][0]) for i, dim in enumerate(xarr.dims)
    )


@get_chunksizes.register(xr.Dataset)
def _get_chunksizes_dataset(xarr):
    if not xarr.chunks:
        return {}
    return OrderedDict(
        (dim, chunks[0]) for dim, chunks in xarr.chunks.items()
    )
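
# Hypothetical sketch (not in the original module) of ``get_chunksizes`` on
# dask-backed xarray objects; assumes numpy and dask are installed.
def _example_get_chunksizes():  # pragma: no cover
    import numpy as np

    da = xr.DataArray(np.zeros((10, 10)), dims=('y', 'x'),
                      name='blu').chunk({'y': 5, 'x': 5})
    assert get_chunksizes(da) == {'y': 5, 'x': 5}
    assert get_chunksizes(da.to_dataset()) == {'y': 5, 'x': 5}
    # Unchunked (NumPy-backed) data has no chunks
    assert get_chunksizes(xr.DataArray(np.zeros(3))) == {}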

 

 

@singledispatch
def chunks_to_chunksizes(data, dims=None):
    """ Convert an object to chunksizes (i.e., as used in encoding)

    Parameters
    ----------
    data : xarray.DataArray, dict, or xarray.Dataset
        Input data containing chunk information
    dims : Sequence[str], optional
        Optionally, provide the order in which dimension chunksizes
        should be returned. Useful when asking for chunksizes from
        not-necessarily-ordered data (dicts and Datasets)

    Returns
    -------
    tuple
        Chunk sizes for each dimension. Returns an empty tuple if there
        are no chunks.
    """
    raise TypeError(f'Unknown type for input ``data`` "{type(data)}"')

 

 

_DICTS = (dict, xr.core.utils.FrozenOrderedDict)


@register_multi_singledispatch(chunks_to_chunksizes, _DICTS)
def _chunks_to_chunksizes_dict(data, dims=None):
    dims_ = dims or data.keys()
    return tuple(data[d] if isinstance(data[d], int) else data[d][0]
                 for d in dims_)

 

 

@chunks_to_chunksizes.register(xr.Dataset)
def _chunks_to_chunksizes_dataset(data, dims=None):
    if not data.chunks:
        return ()
    return _chunks_to_chunksizes_dict(data.chunks, dims=dims)


@chunks_to_chunksizes.register(xr.DataArray)
def _chunks_to_chunksizes_dataarray(data, dims=None):
    if not data.chunks:
        return ()
    if dims:
        dim_idx = [data.dims.index(d) for d in dims]
    else:
        dim_idx = range(len(data.chunks))

    return tuple(data.chunks[i][0] for i in dim_idx)
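
# Hypothetical sketch (not in the original module) of ``chunks_to_chunksizes``
# on dicts and on a chunked DataArray, including pinning the dimension order
# via ``dims``; assumes numpy and dask are installed.
def _example_chunks_to_chunksizes():  # pragma: no cover
    import numpy as np

    # From a "chunks" mapping (tuple per dimension) or a "chunksizes"
    # mapping (int per dimension)
    assert chunks_to_chunksizes({'time': (3, 3, 3, 1)}) == (3, )
    assert chunks_to_chunksizes({'time': 3}) == (3, )

    # From a chunked DataArray, optionally reordering by ``dims``
    da = xr.DataArray(np.zeros((4, 6)),
                      dims=('y', 'x')).chunk({'y': 2, 'x': 3})
    assert chunks_to_chunksizes(da) == (2, 3)
    assert chunks_to_chunksizes(da, dims=('x', 'y')) == (3, 2)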