Add --dedupe option to skip exact duplicate rows in the CSV output

This commit is contained in:
2025-09-22 16:24:26 +02:00
parent 35b297a91a
commit e4ff804dfb

View File

@@ -87,7 +87,7 @@ def parse_key_value_table(table_tag):
return data return data
def write_single_csv(policies, output_csv='policies.csv'): def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
""" """
Writes a single CSV with columns in this order: Writes a single CSV with columns in this order:
1) PolicyName 1) PolicyName
@@ -100,6 +100,7 @@ def write_single_csv(policies, output_csv='policies.csv'):
8) Last modified 8) Last modified
Each row corresponds to one Setting. Each row corresponds to one Setting.
If dedupe=True, exact duplicate rows (across all policies) are skipped.
""" """
# The exact order we want: # The exact order we want:
columns = [ columns = [
@@ -113,6 +114,10 @@ def write_single_csv(policies, output_csv='policies.csv'):
"Last modified" "Last modified"
] ]
# De-duplication support (across the entire file)
seen_rows = set() if dedupe else None
rows_written = 0
with open(output_csv, 'w', newline='', encoding='utf-8') as f: with open(output_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f) writer = csv.writer(f)
# Write header # Write header
@@ -143,7 +148,13 @@ def write_single_csv(policies, output_csv='policies.csv'):
created, created,
last_modified last_modified
] ]
if seen_rows is not None:
key = tuple(row)
if key in seen_rows:
continue
seen_rows.add(key)
writer.writerow(row) writer.writerow(row)
rows_written += 1
continue continue
# Otherwise, write one row per setting # Otherwise, write one row per setting
@@ -158,8 +169,15 @@ def write_single_csv(policies, output_csv='policies.csv'):
created, created,
last_modified last_modified
] ]
if seen_rows is not None:
key = tuple(row)
if key in seen_rows:
continue
seen_rows.add(key)
writer.writerow(row) writer.writerow(row)
rows_written += 1
return rows_written
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
@@ -182,6 +200,14 @@ def main():
), ),
default=None, default=None,
) )
parser.add_argument(
"--dedupe",
action="store_true",
help=(
"Drop exact duplicate rows in the output (by the full row: PolicyName, Description, "
"SettingKey, SettingValue, Policy type, Platform supported, Created, Last modified)."
),
)
args = parser.parse_args() args = parser.parse_args()
@@ -200,17 +226,12 @@ def main():
policies = parse_tables_from_markdown(input_path) policies = parse_tables_from_markdown(input_path)
# Count rows that will be written (one per setting, or one if no settings) rows_written = write_single_csv(policies, output_csv, dedupe=args.dedupe)
row_count = 0
for p in policies:
settings = p.get("settings") or {}
row_count += max(len(settings), 1)
write_single_csv(policies, output_csv) msg = f"Done! Parsed {len(policies)} policies and wrote {rows_written} rows to '{output_csv}'."
if args.dedupe:
print( msg += " (duplicates removed)"
f"Done! Parsed {len(policies)} policies and wrote {row_count} rows to '{output_csv}'." print(msg)
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()