# IntunePolicyParser/parseIntuneSettingsMarkdown.py

import csv
from bs4 import BeautifulSoup
import argparse
import os

def parse_tables_from_markdown(md_file_path):
    """
    Parses a Markdown/HTML file containing <h3> headings and 'table-settings' tables.

    Assumes each policy:
      - starts with an <h3> tag that has the policy name
      - is followed (as siblings of that <h3>) by two
        <table class='table-settings'> elements:
         1) 'Basics'
         2) 'Settings'

    The search for a policy's tables stops at the next <h3>, so a heading
    that has fewer than two tables of its own is skipped instead of
    borrowing tables that belong to the following policy.

    md_file_path: path of the UTF-8 encoded file to read.

    Returns a list of dicts (in document order), each with:
      {
        'basic_info': { key -> value, ... },   # plus 'PolicyName' if the
                                               # Basics table has no such key
        'settings': { key -> value, ... }
      }
    """
    with open(md_file_path, 'r', encoding='utf-8') as f:
        markup = f.read()

    soup = BeautifulSoup(markup, 'lxml')
    policies = []

    # Every <h3> is treated as a policy heading
    for h3 in soup.find_all('h3'):
        policy_name = h3.get_text(strip=True)

        # Collect the 'table-settings' tables that belong to this heading
        policy_tables = []
        for sibling in h3.next_siblings:
            # Text nodes between elements have no usable tag name
            tag_name = getattr(sibling, 'name', None)

            # The next heading starts another policy: do not cross into it
            if tag_name == 'h3':
                break

            if tag_name == 'table' and 'table-settings' in sibling.get('class', []):
                policy_tables.append(sibling)
                if len(policy_tables) == 2:
                    break

        # If fewer than 2 tables, skip this policy
        if len(policy_tables) < 2:
            continue

        basics_table, settings_table = policy_tables

        # Parse out the Basic Info and Settings
        basic_info = parse_key_value_table(basics_table)
        settings_info = parse_key_value_table(settings_table)

        # Put policy name into basic_info if not already present
        basic_info.setdefault("PolicyName", policy_name)

        policies.append({
            'basic_info': basic_info,
            'settings': settings_info,
        })

    return policies

def parse_key_value_table(table_tag):
    """
    Given a <table> with class 'table-settings', parse each data row into a
    { key: value } dict, where each row is <td>Key</td><td>Value</td>.

    Rows are read from the table's direct <tr> children and from the direct
    <tr> children of any <thead>/<tbody>/<tfoot> directly under the table,
    in document order. This way the result does not depend on whether the
    markup wraps its rows in a <tbody>. Rows of nested tables are not visited.

    Skipped rows:
      - rows whose class list contains 'table-header1' or 'category-level1'
        (table header and category rows)
      - rows with fewer than two direct <td> cells

    Keys and values are the whitespace-stripped text of the first and second
    cell. If a key appears more than once, the last value wins.
    """
    skipped_row_classes = ('table-header1', 'category-level1')

    # Gather rows in document order, looking one level into section wrappers
    rows = []
    for child in table_tag.find_all(['tr', 'thead', 'tbody', 'tfoot'], recursive=False):
        if child.name == 'tr':
            rows.append(child)
        else:
            rows.extend(child.find_all('tr', recursive=False))

    data = {}
    for row in rows:
        # Skip table header and category rows
        row_classes = row.get('class', [])
        if any(marker in row_classes for marker in skipped_row_classes):
            continue

        cols = row.find_all('td', recursive=False)
        if len(cols) < 2:
            continue  # can't parse a key-value from this row

        key_text = cols[0].get_text(strip=True)
        val_text = cols[1].get_text(strip=True)
        data[key_text] = val_text

    return data

def write_single_csv(policies, output_csv='policies.csv', dedupe=False, dedupe_scope="exact", lineterminator='\n'):
    """
    Flatten parsed policies into one CSV file, one row per setting.

    Column order of the output:
      1) PolicyName
      2) Description
      3) SettingKey
      4) SettingValue
      5) Policy type (read from the 'Profile type' entry of basic_info)
      6) Platform supported
      7) Created
      8) Last modified

    A policy without settings still produces a single row whose SettingKey
    and SettingValue are empty.

    When `dedupe` is true, a row is dropped if its identity was already
    written earlier in the file. `dedupe_scope` selects what makes up that
    identity:
    - 'exact'  -> the full row (default; also used for any unknown scope)
    - 'policy' -> (PolicyName, SettingKey, SettingValue)
    - 'global' -> (SettingKey, SettingValue, Policy type, Platform supported)

    `lineterminator` is the line ending passed to the CSV writer
    (default `\n`, use `\r\n` for Windows-style).

    Returns the number of data rows written (the header is not counted).
    """
    header = [
        "PolicyName",
        "Description",
        "SettingKey",
        "SettingValue",
        "Policy type",
        "Platform supported",
        "Created",
        "Last modified",
    ]

    # Positions (within a row) of the columns that form the duplicate identity
    if dedupe_scope == "policy":
        identity_columns = (0, 2, 3)
    elif dedupe_scope == "global":
        identity_columns = (2, 3, 4, 5)
    else:
        # 'exact', and the fallback for an unrecognized scope: the whole row
        identity_columns = tuple(range(len(header)))

    already_written = set()
    written_count = 0

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle, lineterminator=lineterminator)
        out.writerow(header)

        for entry in policies:
            info = entry['basic_info']
            setting_map = entry['settings']

            name = info.get("PolicyName", "")
            summary = info.get("Description", "")
            # The CSV column is "Policy type", the source data calls it "Profile type"
            kind = info.get("Profile type", "")
            platform = info.get("Platform supported", "")
            created_at = info.get("Created", "")
            modified_at = info.get("Last modified", "")

            # A policy with no settings is represented by one blank-setting row
            if setting_map:
                pairs = setting_map.items()
            else:
                pairs = [("", "")]

            for key_text, value_text in pairs:
                record = [
                    name,
                    summary,
                    key_text,
                    value_text,
                    kind,
                    platform,
                    created_at,
                    modified_at,
                ]
                if dedupe:
                    identity = tuple(record[i] for i in identity_columns)
                    if identity in already_written:
                        continue
                    already_written.add(identity)
                out.writerow(record)
                written_count += 1

    return written_count

def main():
    """
    Command-line entry point.

    Reads the input file (positional argument, falling back to 'cqre.md'),
    parses its policies, writes them to a CSV and prints a one-line summary.

    Output path resolution:
      - the value of -o/--output when given
      - otherwise '<input base name>.csv' when an input was given
      - otherwise 'policies-cqre.csv'
    """
    cli = argparse.ArgumentParser(
        description=(
            "Parse an Intune Markdown/HTML export with <h3> headings and two "
            "<table class='table-settings'> sections (Basics + Settings) into a flat CSV."
        )
    )
    cli.add_argument(
        "input",
        nargs="?",
        default=None,
        help="Path to the Markdown/HTML file to parse (default: cqre.md)",
    )
    cli.add_argument(
        "-o", "--output",
        default=None,
        help=(
            "Path to output CSV file. If not provided, derives from input name "
            "(e.g., input.md -> input.csv). If no input is given, defaults to policies-cqre.csv."
        ),
    )
    cli.add_argument(
        "--dedupe",
        action="store_true",
        help=(
            "Drop exact duplicate rows in the output (by the full row: PolicyName, Description, "
            "SettingKey, SettingValue, Policy type, Platform supported, Created, Last modified)."
        ),
    )
    cli.add_argument(
        "--dedupe-scope",
        choices=["exact", "policy", "global"],
        default="exact",
        help=(
            "How to identify duplicates when --dedupe is set: 'exact' (full row), "
            "'policy' (PolicyName+SettingKey+SettingValue), or 'global' (SettingKey+SettingValue+Policy type+Platform)."
        ),
    )
    cli.add_argument(
        "--newline",
        choices=["lf", "crlf"],
        default="lf",
        help=(
            "Choose line endings for the output CSV: 'lf' (\\n, macOS/Linux) or 'crlf' (\\r\\n, Windows)."
        ),
    )

    options = cli.parse_args()

    # Fall back to the historical default input when none is supplied
    source_path = options.input if options.input else "cqre.md"

    # Explicit output wins; otherwise derive it from the input name, if any
    if options.output:
        target_csv = options.output
    elif options.input:
        stem, _extension = os.path.splitext(os.path.basename(source_path))
        target_csv = f"{stem}.csv"
    else:
        target_csv = "policies-cqre.csv"

    parsed = parse_tables_from_markdown(source_path)

    line_ending = "\r\n" if options.newline != "lf" else "\n"

    row_count = write_single_csv(
        parsed,
        target_csv,
        dedupe=options.dedupe,
        dedupe_scope=options.dedupe_scope,
        lineterminator=line_ending,
    )

    # Summary suffix lists the newline mode and, when enabled, the dedupe scope
    details = [f"newline={options.newline}"]
    if options.dedupe:
        details.append(f"dedupe={options.dedupe_scope}")
    suffix = ", ".join(details)

    print(
        f"Done! Parsed {len(parsed)} policies and wrote {row_count} rows to '{target_csv}'. "
        f"({suffix})"
    )

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()