Fixing dedupe

This commit is contained in:
2025-09-22 16:54:58 +02:00
parent e4ff804dfb
commit 0bed241a54
2 changed files with 125 additions and 8 deletions

View File

@@ -87,7 +87,7 @@ def parse_key_value_table(table_tag):
return data
def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
def write_single_csv(policies, output_csv='policies.csv', dedupe=False, dedupe_scope="exact", lineterminator='\n'):
"""
Writes a single CSV with columns in this order:
1) PolicyName
@@ -101,6 +101,11 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
Each row corresponds to one Setting.
If dedupe=True, duplicate rows (across all policies) are skipped; only the first occurrence is written.
`dedupe_scope` controls how duplicates are identified:
- 'exact' -> full row match (default)
- 'policy' -> (PolicyName, SettingKey, SettingValue)
- 'global' -> (SettingKey, SettingValue, Policy type, Platform supported)
`lineterminator` is the line ending used when writing the CSV (LF by default; pass CR+LF for Windows-style files).
"""
# The exact order we want:
columns = [
@@ -114,12 +119,35 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
"Last modified"
]
def make_key(row_list):
    """
    Build the de-duplication key for one output row.

    Reads `dedupe` and `dedupe_scope` from the enclosing function.
    Returns None when de-duplication is off; otherwise a tuple made of the
    row's columns that matter for the selected scope.
    """
    if not dedupe:
        return None

    # row_list layout: [PolicyName, Description, SettingKey, SettingValue, Policy type, Platform, Created, Last modified]
    if dedupe_scope == "policy":
        # PolicyName, SettingKey, SettingValue
        picked_columns = (0, 2, 3)
    elif dedupe_scope == "global":
        # SettingKey, SettingValue, Policy type, Platform supported
        picked_columns = (2, 3, 4, 5)
    else:
        # 'exact' and any unknown scope both compare the full row
        return tuple(row_list)

    return tuple(row_list[position] for position in picked_columns)
# De-duplication support (across the entire file)
seen_rows = set() if dedupe else None
rows_written = 0
with open(output_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer = csv.writer(f, lineterminator=lineterminator)
# Write header
writer.writerow(columns)
@@ -149,7 +177,7 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
last_modified
]
if seen_rows is not None:
key = tuple(row)
key = make_key(row)
if key in seen_rows:
continue
seen_rows.add(key)
@@ -170,7 +198,7 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
last_modified
]
if seen_rows is not None:
key = tuple(row)
key = make_key(row)
if key in seen_rows:
continue
seen_rows.add(key)
@@ -208,6 +236,23 @@ def main():
"SettingKey, SettingValue, Policy type, Platform supported, Created, Last modified)."
),
)
parser.add_argument(
"--dedupe-scope",
choices=["exact", "policy", "global"],
default="exact",
help=(
"How to identify duplicates when --dedupe is set: 'exact' (full row), "
"'policy' (PolicyName+SettingKey+SettingValue), or 'global' (SettingKey+SettingValue+Policy type+Platform)."
),
)
parser.add_argument(
"--newline",
choices=["lf", "crlf"],
default="lf",
help=(
"Choose line endings for the output CSV: 'lf' (\\n, macOS/Linux) or 'crlf' (\\r\\n, Windows)."
),
)
args = parser.parse_args()
@@ -226,11 +271,23 @@ def main():
policies = parse_tables_from_markdown(input_path)
rows_written = write_single_csv(policies, output_csv, dedupe=args.dedupe)
lineterminator = "\n" if args.newline == "lf" else "\r\n"
msg = f"Done! Parsed {len(policies)} policies and wrote {rows_written} rows to '{output_csv}'."
rows_written = write_single_csv(
policies,
output_csv,
dedupe=args.dedupe,
dedupe_scope=args.dedupe_scope,
lineterminator=lineterminator,
)
msg = (
f"Done! Parsed {len(policies)} policies and wrote {rows_written} rows to '{output_csv}'. "
f"(newline={args.newline}"
)
if args.dedupe:
msg += " (duplicates removed)"
msg += f", dedupe={args.dedupe_scope}"
msg += ")"
print(msg)
if __name__ == "__main__":