From ceb76b0893b5a0cbfeab269d373b6bb656222b69 Mon Sep 17 00:00:00 2001 From: Guilhem Moulin Date: Wed, 19 Jun 2024 04:29:26 +0200 Subject: Add logic for field regex substitution. This is useful to replace a YYYYMMDD formatted date with YYYY-MM-DD. The target field can then be set to not-nullable and its type set to Date, as the OGR_F_SetField*() will take care of the conversion. We could also do that via an SQL query, but in our case the sources are not proper RDBMS so SQL is emulated anyway. --- webmap-import | 100 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 74 insertions(+), 26 deletions(-) (limited to 'webmap-import') diff --git a/webmap-import b/webmap-import index b97b275..731c49f 100755 --- a/webmap-import +++ b/webmap-import @@ -758,9 +758,16 @@ def setOutputFieldMap(defn, sources): for idx, rule in enumerate(rules): if rule is None or not isinstance(rule, dict): raise Exception(f'Field "{fldName}" has invalid rule #{idx}: {rule}') - if 'from' not in rule or 'to' not in rule or len(rule) != 2: + if 'type' not in rule: + ruleType = rule['type'] = 'literal' + else: + ruleType = rule['type'] + if ('replace' not in rule or 'with' not in rule or len(rule) != 3 or + ruleType is None or ruleType not in ['literal', 'regex']): raise Exception(f'Field "{fldName}" has invalid rule #{idx}: {rule}') - rules[idx] = ( rule['from'], rule['to'] ) + if ruleType == 'regex': + rule['replace'] = re.compile(rule['replace']) + rules[idx] = ( rule['replace'], rule['with'] ) # Escape the given identifier, cf. 
# swig/python/gdal-utils/osgeo_utils/samples/validate_gpkg.py:_esc_id() @@ -1002,8 +1009,8 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): else: logging.info('Source layer "%s" has %d features', layername, count0) - # build a list of pairs (field index, mapping_dict) - valueMapLiteral = [] + # build a list of triplets (field index, replacement_for_null, [(from_value, to_value), …]) + valueMap = [] for fldName, rules in args.get('value-map', {}).items(): i = defn.GetFieldIndex(fldName) if i < 0: @@ -1012,20 +1019,34 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): logging.warning('Ignored source field "%s" has value map', fldName) continue - h = {} + hasNullReplacement = False + nullReplacement = None + mapping = [] fld = defn.GetFieldDefn(i) for idx, (rFrom, rTo) in enumerate(rules): - # use fld for both from and to (the types must match, - # casting is not allowed in the mapping) - rFrom = setFieldMapValue(fld, idx, rFrom) - rTo = setFieldMapValue(fld, idx, rTo) - h[rFrom] = rTo - - if len(h) > 0: - valueMapLiteral.append((i, h)) + # use fld for both 'from' and 'to' (the types must match, casting is not allowed in the mapping) + if rFrom is None: + if hasNullReplacement: + logging.warning('Field "%s" has duplicate NULL replacement', fld.GetName()) + else: + setFieldMapValue(fld, idx, None) # validate NULL + rTo = setFieldMapValue(fld, idx, rTo) + hasNullReplacement = True + nullReplacement = rTo + elif isinstance(rFrom, re.Pattern): + # validate but keep the rFrom regex + setFieldMapValue(fld, idx, str(rFrom)) + rTo = setFieldMapValue(fld, idx, rTo) + mapping.append( (rFrom, rTo, 1) ) + else: + rFrom = setFieldMapValue(fld, idx, rFrom) + rTo = setFieldMapValue(fld, idx, rTo) + mapping.append( (rFrom, rTo, 0) ) - bValueMapLiteral = len(valueMapLiteral) > 0 + if nullReplacement is not None or len(mapping) > 0: + valueMap.append( (i, nullReplacement, mapping) ) + bValueMap = len(valueMap) > 0 defn = None defn_dst = 
lyr_dst.GetLayerDefn() @@ -1034,31 +1055,48 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): eGType_dst_HasM = ogr.GT_HasM(eGType_dst) dGeomIsUnknown = ogr.GT_Flatten(eGType_dst) == ogr.wkbUnknown + if bValueMap: + valueMapCounts = [0] * fieldCount + n = 0 mismatch = {} feature = lyr.GetNextFeature() while feature is not None: - if bValueMapLiteral: - for i, h in valueMapLiteral: + if bValueMap: + for i, nullReplacement, mapping in valueMap: if not feature.IsFieldSet(i): continue elif feature.IsFieldNull(i): - if None in h: - v = h[None] - if v is not None: - # replace NULL with non-NULL value - feature.SetField(i, v) + if nullReplacement is not None: + # replace NULL with non-NULL value + feature.SetField(i, nullReplacement) + valueMapCounts[i] += 1 continue v = feature.GetField(i) - if v in h: - v2 = h[v] - if v2 is None: + for rFrom, rTo, rType in mapping: + if rType == 0: + # literal + if v != rFrom: + continue + elif rType == 1: + # regex + m = rFrom.fullmatch(v) + if m is None: + continue + elif rTo is not None: + rTo = rTo.format(*m.groups()) + else: + raise Exception(str(rType)) + + if rTo is None: # replace non-NULL value with NULL feature.SetFieldNull(i) else: # replace non-NULL value with non-NULL value - feature.SetField(i, v2) + feature.SetField(i, rTo) + valueMapCounts[i] += 1 + break feature2 = ogr.Feature(defn_dst) feature2.SetFromWithMap(feature, False, fieldMap) @@ -1092,11 +1130,21 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): n += 1 feature = lyr.GetNextFeature() + if bValueMap: + valueMapCounts = [ (lyr.GetLayerDefn().GetFieldDefn(i).GetName(), k) for i,k in enumerate(valueMapCounts) if k > 0 ] + lyr = None logging.info('Imported %d features from source layer "%s"', n, layername) + if bValueMap: + if len(valueMapCounts) > 0: + valueMapCounts = ', '.join([ str(k) + '× "' + n + '"' for n,k in valueMapCounts ]) + else: + valueMapCounts = '-' + logging.info('Field substitutions: %s', 
valueMapCounts) + if len(mismatch) > 0: - mismatches = [ str(n) + '× ' + ogr.GeometryTypeToName(t) + mismatches = [ str(n) + '× ' + ogr.GeometryTypeToName(t) for t,n in sorted(mismatch.items(), key=lambda x: x[1]) ] logging.info('Forced conversion to %s: %s', ogr.GeometryTypeToName(eGType_dst), ', '.join(mismatches)) -- cgit v1.2.3