From ceb76b0893b5a0cbfeab269d373b6bb656222b69 Mon Sep 17 00:00:00 2001 From: Guilhem Moulin Date: Wed, 19 Jun 2024 04:29:26 +0200 Subject: Add logic for field regex substitution. This is useful to replace a YYYYMMDD formatted date with YYYY-MM-DD. The target field can then be set to not-nullable and its type set to Date, as the OGR_F_SetField*() with take care of the conversion. We could also do that via an SQL query, but in our case the sources are not proper RDBMS so SQL is emulated anyway. --- config.yml | 134 ++++++++++++++++++++++++++++++++++------------------------ schema.sql | 16 +++---- webmap-import | 100 +++++++++++++++++++++++++++++++------------ 3 files changed, 161 insertions(+), 89 deletions(-) diff --git a/config.yml b/config.yml index 183599c..60b9a20 100644 --- a/config.yml +++ b/config.yml @@ -231,18 +231,23 @@ layers: # source_field_name1: field_name1 # source_field_name2: field_name2 # -# # Mapping of (source) field values, to replace particular -# # values with something else. Useful for instance to replace -# # the empty string with NULL. +# # Mapping of (source) field values, to replace literal values or values (fully) +# # matching a regular expression with something else. Useful for instance to +# # replace the empty string with NULL. Rules are tested in the order given and +# # stop at the first match. # value-map: # source_field_name1: -# - from: '' -# to: null -# - from: 'foo' -# to: 'bar' +# - replace: '' +# with: null +# #type: literal +# - replace: 'foo' +# with: 'bar' +# - replace: 'fo[a-z]' +# with: 'baz' +# type: regex # source_field_name2: -# - from: 0 -# to: 1 +# - replace: 0 +# with: 1 'lansyta': description: Sveriges län (Lantmäteriet) @@ -626,8 +631,8 @@ layers: Arealha: ArealHa value-map: Beteckn: - - from: 'Visas ej' - to: null + - replace: 'Visas ej' + with: null - source: download: @@ -661,8 +666,8 @@ layers: Arealha: ArealHa value-map: Beteckn: - - from: 'Visas ej' - to: null + - replace: 'Visas ej' + with: null - source: download: @@ -696,8 +701,8 @@ layers: Arealha: ArealHa value-map: Beteckn: - - from: 'Visas ej' - to: null + - replace: 'Visas ej' + with: null - source: download: @@ -729,8 +734,8 @@ layers: Arealha: ArealHa value-map: Beteckn: - - from: 'Visas ej' - to: null + - replace: 'Visas ej' + with: null 'st:betesomraden': source: @@ -784,9 +789,7 @@ layers: nullable: false width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -804,6 +807,11 @@ layers: 'Application date': ApplicationDate 'Diary nr': DiaryNr 'Last updated': LastUpdated + value-map: + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex 'mrr:bearbetningskoncessioner_approved': description: Bearbetningskoncessioner, beviljade (SGU) @@ -839,9 +847,7 @@ layers: unique: true width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -862,8 +868,12 @@ layers: 'Last updated': LastUpdated value-map: 'Diary nr': - - from: '' - to: null + - replace: '' + with: null + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex 'mrr:markanvisningar': description: Markanvisning till koncession (SGU) @@ -883,9 +893,7 @@ layers: unique: true width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -903,11 +911,15 @@ layers: 'Last updated': LastUpdated value-map: 'Decision date': - - from: '' - to: null + - replace: '' + with: null 'Diary nr': - - from: '' - to: null + - replace: '' + with: null + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex 'mrr:mineral_applied': description: Undersökningstillstånd, metallar och mineral, ansökta (SGU) @@ -939,9 +951,7 @@ layers: nullable: false width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -959,6 +969,11 @@ layers: 'Application date': ApplicationDate 'Diary nr': DiaryNr 'Last updated': LastUpdated + value-map: + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex 'mrr:mineral_approved': description: Undersökningstillstånd, metallar och mineral, beviljade (SGU) @@ -999,9 +1014,7 @@ layers: unique: true width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -1023,8 +1036,12 @@ layers: 'Last updated': LastUpdated value-map: 'Diary nr': - - from: '' - to: null + - replace: '' + with: null + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex # 'mrr:mineral_expired': # source: @@ -1083,9 +1100,7 @@ layers: nullable: false width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -1103,6 +1118,11 @@ layers: 'Application date': ApplicationDate 'Diary nr': DiaryNr 'Last updated': LastUpdated + value-map: + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex 'mrr:olja_gas_diamant_approved': description: Undersökningstillstånd, olja, gas och diamant, beviljade (SGU) @@ -1143,9 +1163,7 @@ layers: unique: true width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -1167,8 +1185,12 @@ layers: 'Last updated': LastUpdated value-map: 'Diary nr': - - from: '' - to: null + - replace: '' + with: null + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex 'mrr:torvkoncessioner': description: Torvkoncessioner (SGU) @@ -1203,9 +1225,7 @@ layers: unique: true width: 16 - name: LastUpdated - # TODO convert to Date - type: String - width: 10 + type: Date nullable: false #comment: datum för senaste uppdatering source: @@ -1226,8 +1246,12 @@ layers: 'Last updated': LastUpdated value-map: 'Mineral': - - from: '' - to: null + - replace: '' + with: null 'Diary nr': - - from: '' - to: null + - replace: '' + with: null + 'Last updated': + - replace: '([0-9]{4})([0-9]{2})([0-9]{2})' + with: '{0}-{1}-{2}' + type: regex diff --git a/schema.sql b/schema.sql index 6fbef97..6b4e773 100644 --- a/schema.sql +++ b/schema.sql @@ -170,7 +170,7 @@ CREATE TABLE postgis."mrr:bearbetningskoncessioner_applied" ( "Applicant" character varying(254) NOT NULL, "ApplicationDate" date NOT NULL, "DiaryNr" character(16) NOT NULL, - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -217,7 +217,7 @@ CREATE TABLE postgis."mrr:bearbetningskoncessioner_approved" ( "ValidFrom" date NOT NULL, "ValidTo" date NOT NULL, "DiaryNr" character(16), - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -261,7 +261,7 @@ CREATE TABLE postgis."mrr:markanvisningar" ( "Name" character varying(254) NOT NULL, "DecisionDate" date, "DiaryNr" character(16), - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -307,7 +307,7 @@ CREATE TABLE postgis."mrr:mineral_applied" ( "Applicant" character varying(254) NOT NULL, "ApplicationDate" date NOT NULL, "DiaryNr" character(16) NOT NULL, - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -355,7 +355,7 @@ CREATE TABLE postgis."mrr:mineral_approved" ( "ValidFrom" date NOT NULL, "ValidTo" date NOT NULL, "DiaryNr" character(16), - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -401,7 +401,7 @@ CREATE TABLE postgis."mrr:olja_gas_diamant_applied" ( "Applicant" character varying(254) NOT NULL, "ApplicationDate" date NOT NULL, "DiaryNr" character(16) NOT NULL, - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -449,7 +449,7 @@ CREATE TABLE postgis."mrr:olja_gas_diamant_approved" ( "ValidFrom" date NOT NULL, "ValidTo" date NOT NULL, "DiaryNr" character(16), - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); @@ -496,7 +496,7 @@ CREATE TABLE postgis."mrr:torvkoncessioner" ( "ValidFrom" date NOT NULL, "ValidTo" date NOT NULL, "DiaryNr" character(16), - "LastUpdated" character(10) NOT NULL, + "LastUpdated" date NOT NULL, wkb_geometry postgis.geometry(MultiPolygon,3006) ); diff --git a/webmap-import b/webmap-import index b97b275..731c49f 100755 --- a/webmap-import +++ b/webmap-import @@ -758,9 +758,16 @@ def setOutputFieldMap(defn, sources): for idx, rule in enumerate(rules): if rule is None or not isinstance(rule, dict): raise Exception(f'Field "{fldName}" has invalid rule #{idx}: {rule}') - if 'from' not in rule or 'to' not in rule or len(rule) != 2: + if 'type' not in rule: + ruleType = rule['type'] = 'literal' + else: + ruleType = rule['type'] + if ('replace' not in rule or 'with' not in rule or len(rule) != 3 or + ruleType is None or ruleType not in ['literal', 'regex']): raise Exception(f'Field "{fldName}" has invalid rule #{idx}: {rule}') - rules[idx] = ( rule['from'], rule['to'] ) + if ruleType == 'regex': + rule['replace'] = re.compile(rule['replace']) + rules[idx] = ( rule['replace'], rule['with'] ) # Escape the given identifier, cf. # swig/python/gdal-utils/osgeo_utils/samples/validate_gpkg.py:_esc_id() @@ -1002,8 +1009,8 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): else: logging.info('Source layer "%s" has %d features', layername, count0) - # build a list of pairs (field index, mapping_dict) - valueMapLiteral = [] + # build a list of triplets (field index, replacement_for_null, [(from_value, to_value), …]) + valueMap = [] for fldName, rules in args.get('value-map', {}).items(): i = defn.GetFieldIndex(fldName) if i < 0: @@ -1012,20 +1019,34 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): logging.warning('Ignored source field "%s" has value map', fldName) continue - h = {} + hasNullReplacement = False + nullReplacement = None + mapping = [] fld = defn.GetFieldDefn(i) for idx, (rFrom, rTo) in enumerate(rules): - # use fld for both from and to (the types must match, - # casting is not allowed in the mapping) - rFrom = setFieldMapValue(fld, idx, rFrom) - rTo = setFieldMapValue(fld, idx, rTo) - h[rFrom] = rTo - - if len(h) > 0: - valueMapLiteral.append((i, h)) + # use fld for both 'from' and 'to' (the types must match, casting is not allowed in the mapping) + if rFrom is None: + if hasNullReplacement: + logging.warning('Field "%s" has duplicate NULL replacement', fld.GetName()) + else: + setFieldMapValue(fld, idx, None) # validate NULL + rTo = setFieldMapValue(fld, idx, rTo) + hasNullReplacement = True + nullReplacement = rTo + elif isinstance(rFrom, re.Pattern): + # validate but keep the rFrom regex + setFieldMapValue(fld, idx, str(rFrom)) + rTo = setFieldMapValue(fld, idx, rTo) + mapping.append( (rFrom, rTo, 1) ) + else: + rFrom = setFieldMapValue(fld, idx, rFrom) + rTo = setFieldMapValue(fld, idx, rTo) + mapping.append( (rFrom, rTo, 0) ) - bValueMapLiteral = len(valueMapLiteral) > 0 + if nullReplacement is not None or len(mapping) > 0: + valueMap.append( (i, nullReplacement, mapping) ) + bValueMap = len(valueMap) > 0 defn = None defn_dst = lyr_dst.GetLayerDefn() @@ -1034,31 +1055,48 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): eGType_dst_HasM = ogr.GT_HasM(eGType_dst) dGeomIsUnknown = ogr.GT_Flatten(eGType_dst) == ogr.wkbUnknown + if bValueMap: + valueMapCounts = [0] * fieldCount + n = 0 mismatch = {} feature = lyr.GetNextFeature() while feature is not None: - if bValueMapLiteral: - for i, h in valueMapLiteral: + if bValueMap: + for i, nullReplacement, mapping in valueMap: if not feature.IsFieldSet(i): continue elif feature.IsFieldNull(i): - if None in h: - v = h[None] - if v is not None: - # replace NULL with non-NULL value - feature.SetField(i, v) + if nullReplacement is not None: + # replace NULL with non-NULL value + feature.SetField(i, nullReplacement) + valueMapCounts[i] += 1 continue v = feature.GetField(i) - if v in h: - v2 = h[v] - if v2 is None: + for rFrom, rTo, rType in mapping: + if rType == 0: + # literal + if v != rFrom: + continue + elif rType == 1: + # regex + m = rFrom.fullmatch(v) + if m is None: + continue + elif rTo is not None: + rTo = rTo.format(*m.groups()) + else: + raise Exception(str(rType)) + + if rTo is None: # replace non-NULL value with NULL feature.SetFieldNull(i) else: # replace non-NULL value with non-NULL value - feature.SetField(i, v2) + feature.SetField(i, rTo) + valueMapCounts[i] += 1 + break feature2 = ogr.Feature(defn_dst) feature2.SetFromWithMap(feature, False, fieldMap) @@ -1092,11 +1130,21 @@ def importSource2(lyr_dst, path, args={}, basedir=None, extent=None): n += 1 feature = lyr.GetNextFeature() + if bValueMap: + valueMapCounts = [ (lyr.GetLayerDefn().GetFieldDefn(i).GetName(), k) for i,k in enumerate(valueMapCounts) if k > 0 ] + lyr = None logging.info('Imported %d features from source layer "%s"', n, layername) + if bValueMap: + if len(valueMapCounts) > 0: + valueMapCounts = ', '.join([ str(k) + '× "' + n + '"' for n,k in valueMapCounts ]) + else: + valueMapCounts = '-' + logging.info('Field substitutions: %s', valueMapCounts) + if len(mismatch) > 0: - mismatches = [ str(n) + '× ' + ogr.GeometryTypeToName(t) + mismatches = [ str(n) + '× ' + ogr.GeometryTypeToName(t) for t,n in sorted(mismatch.items(), key=lambda x: x[1]) ] logging.info('Forced conversion to %s: %s', ogr.GeometryTypeToName(eGType_dst), ', '.join(mismatches)) -- cgit v1.2.3