Fixing dedupe

2025-09-22 16:54:58 +02:00
parent e4ff804dfb
commit 0bed241a54
2 changed files with 125 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,63 @@
 # IntunePolicyParser
-Parsing policies from Intune to Excel
+
 A utility to parse **Markdown/HTML documentation exports** from the [IntuneManagement](https://github.com/...) tool into flat CSV files that can be further analyzed in Excel or Power BI.
 ## Source files
 This parser expects as input a **Documentation export** from the IntuneManagement tool. Use the `Export Documentation (Markdown)` feature, which produces a `.md` file containing:
 - `<h3>` sections for each policy,
 - two `<table class="table-settings">` blocks under each heading (Basics + Settings).
 ## Usage
 ```bash
 python intune.py [input_file.md] [-o OUTPUT.csv] [--dedupe] [--dedupe-scope {exact,policy,global}] [--newline {lf,crlf}]
 ```
 ### Arguments
 - `input_file.md`  
  Path to the Intune documentation export (Markdown/HTML).  
  Defaults to `cqre.md`.
 - `-o, --output OUTPUT.csv`  
  Output CSV file. If not provided, the name is derived from the input (e.g. `input.md -> input.csv`).
 - `--dedupe`  
  Enable removal of duplicate rows.
 - `--dedupe-scope {exact,policy,global}`  
  How duplicates are identified:  
  - `exact` → full row must match (default).  
  - `policy` → unique per (PolicyName + SettingKey + SettingValue).  
  - `global` → unique per (SettingKey + SettingValue + Policy type + Platform).
 - `--newline {lf,crlf}`  
  Choose line endings for the output CSV:  
  - `lf` (Unix/macOS/Linux, default)  
  - `crlf` (Windows)
 ## Output format
 The generated CSV contains one row per policy setting with these columns:
 - `PolicyName`  
 - `Description`  
 - `SettingKey`  
 - `SettingValue`  
 - `Policy type`  
 - `Platform supported`  
 - `Created`  
 - `Last modified`  
 - `Scope tags`
 ## Example
 ```bash
 # Parse and dedupe by policy, exporting with LF line endings
 python intune.py CQRE.NET-2025-09-22.md -o CQRE.NET-2025-09-22.csv --dedupe --dedupe-scope policy --newline lf
 ```
 This creates `CQRE.NET-2025-09-22.csv` ready for analysis in Excel.
--- a/intune.py
+++ b/intune.py
@@ -87,7 +87,7 @@ def parse_key_value_table(table_tag):
    return data
-def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
+def write_single_csv(policies, output_csv='policies.csv', dedupe=False, dedupe_scope="exact", lineterminator='\n'):
    """
    Writes a single CSV with columns in this order:
      1) PolicyName
@@ -101,6 +101,11 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
    Each row corresponds to one Setting.
    If dedupe=True, duplicate rows (across all policies) are skipped.
    `dedupe_scope` controls how duplicates are identified:
    - 'exact'  -> full row match (default)
    - 'policy' -> (PolicyName, SettingKey, SettingValue)
    - 'global' -> (SettingKey, SettingValue, Policy type, Platform supported)
    `lineterminator` is the line ending used when writing the CSV (default `\\n`; use `\\r\\n` for Windows-style).
    """
    # The exact order we want:
    columns = [
@@ -114,12 +119,35 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
        "Last modified"
    ]
    def make_key(row_list):
        """
        Build the de-duplication key for one output row.

        Returns None when de-duplication is disabled. Otherwise the key is a
        tuple whose contents depend on `dedupe_scope`:
        - 'policy' -> (PolicyName, SettingKey, SettingValue)
        - 'global' -> (SettingKey, SettingValue, Policy type, Platform supported)
        - 'exact' or any unrecognised scope -> every cell of the row
        """
        if not dedupe:
            return None
        # Positions within row_list:
        #   0 PolicyName, 1 Description, 2 SettingKey, 3 SettingValue,
        #   4 Policy type, 5 Platform supported, 6 Created, 7 Last modified
        if dedupe_scope == "policy":
            wanted = (0, 2, 3)
        elif dedupe_scope == "global":
            wanted = (2, 3, 4, 5)
        else:
            # 'exact' and unknown scopes both compare the complete row
            return tuple(row_list)
        return tuple(row_list[position] for position in wanted)
    # De-duplication support (across the entire file)
    seen_rows = set() if dedupe else None
    rows_written = 0
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
-        writer = csv.writer(f)
+        writer = csv.writer(f, lineterminator=lineterminator)
        # Write header
        writer.writerow(columns)
@@ -149,7 +177,7 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
                    last_modified
                ]
                if seen_rows is not None:
-                    key = tuple(row)
+                    key = make_key(row)
                    if key in seen_rows:
                        continue
                    seen_rows.add(key)
@@ -170,7 +198,7 @@ def write_single_csv(policies, output_csv='policies.csv', dedupe=False):
                    last_modified
                ]
                if seen_rows is not None:
-                    key = tuple(row)
+                    key = make_key(row)
                    if key in seen_rows:
                        continue
                    seen_rows.add(key)
@@ -208,6 +236,23 @@ def main():
            "SettingKey, SettingValue, Policy type, Platform supported, Created, Last modified)."
        ),
    )
    parser.add_argument(
        "--dedupe-scope",
        choices=["exact", "policy", "global"],
        default="exact",
        help=(
            "How to identify duplicates when --dedupe is set: 'exact' (full row), "
            "'policy' (PolicyName+SettingKey+SettingValue), or 'global' (SettingKey+SettingValue+Policy type+Platform)."
        ),
    )
    parser.add_argument(
        "--newline",
        choices=["lf", "crlf"],
        default="lf",
        help=(
            "Choose line endings for the output CSV: 'lf' (\\n, macOS/Linux) or 'crlf' (\\r\\n, Windows)."
        ),
    )
    args = parser.parse_args()
@@ -226,11 +271,23 @@ def main():
    policies = parse_tables_from_markdown(input_path)
-    rows_written = write_single_csv(policies, output_csv, dedupe=args.dedupe)
+    lineterminator = "\n" if args.newline == "lf" else "\r\n"
-    msg = f"Done! Parsed {len(policies)} policies and wrote {rows_written} rows to '{output_csv}'."
+    rows_written = write_single_csv(
        policies,
        output_csv,
        dedupe=args.dedupe,
        dedupe_scope=args.dedupe_scope,
        lineterminator=lineterminator,
    )
    msg = (
        f"Done! Parsed {len(policies)} policies and wrote {rows_written} rows to '{output_csv}'. "
        f"(newline={args.newline}"
    )
    if args.dedupe:
-        msg += " (duplicates removed)"
+        msg += f", dedupe={args.dedupe_scope}"
    msg += ")"
    print(msg)
 if __name__ == "__main__":