# 📦 Importy
import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

# 🔧 Paths: the labeled dataset that is read, and the JSON file that receives
# the list of most frequent rules.
INPUT_PATH = Path("../data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("../data/metadata/top_rules.json")

# 🧪 Accumulators — all start empty and are filled while the dataset is read.
# Tallies keyed by record label and by triggered rule.
labels_counter, rules_counter = Counter(), Counter()
# Per-record measurements: rule count and distinct error-line count (collected
# for "bad" records only) and content length (collected for every record).
rules_per_file, lines_with_errors_per_file, lengths = [], [], []
# Flat list of every reported error position across all "bad" records.
line_positions = []
# Fix bookkeeping: keys seen in any "fix_suggestions" mapping, and the number
# of "bad" records that carried a non-empty one.
unique_rules_with_fixes = set()
fixable_counter = 0

# 📂 Load the dataset: one JSON object per line of INPUT_PATH
with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        # Whitespace-only lines (e.g. a stray blank line in the file) are not
        # valid JSON; skip them so json.loads does not abort the whole pass.
        if not line.strip():
            continue

        obj = json.loads(line)
        label = obj["label"]
        labels_counter[label] += 1

        # Rule, fix and line details are only collected for "bad" records.
        if label == "bad":
            rules = obj.get("rules_triggered", [])
            rules_counter.update(rules)
            rules_per_file.append(len(rules))

            # A record counts as fixable when its "fix_suggestions" mapping is
            # non-empty; the mapping's keys are gathered across all records.
            fixes = obj.get("fix_suggestions", {})
            if fixes:
                fixable_counter += 1
                unique_rules_with_fixes.update(fixes)

            # Values of the "lines" mapping are used as error positions
            # (presumably line numbers — confirm against the labeling step).
            # Duplicates are collapsed for the per-file count but kept in the
            # flat list of positions.
            lines = obj.get("lines", {}).values()
            lines_with_errors_per_file.append(len(set(lines)))
            line_positions.extend(lines)

        # The length of "content" is recorded for every record, good or bad.
        lengths.append(len(obj["content"]))

print("✅ Dane wczytane.")

✅ Dane wczytane.

# 📊 Overall statistics: label totals, number of distinct rules, and how many
# files / rules have fix suggestions. Printed one item per line.
summary_lines = (
    f"✅ Good: {labels_counter['good']}",
    f"❌ Bad:  {labels_counter['bad']}",
    f"🧩 Unikalne reguły: {len(rules_counter)}",
    f"🛠 Plików z co najmniej jednym możliwym fixem: {fixable_counter}",
    f"🔧 Reguły z przypisanym fixem: {len(unique_rules_with_fixes)}",
)
print(*summary_lines, sep="\n")

✅ Good: 1500
❌ Bad:  15000
🧩 Unikalne reguły: 127
🛠 Plików z co najmniej jednym możliwym fixem: 15000
🔧 Reguły z przypisanym fixem: 49

# 🏆 Top 30 rules: print them with their counts (most frequent first), then
# save the rule codes to TOP_RULES_PATH as a JSON list.
top_rules = rules_counter.most_common(30)
for code, count in top_rules:
    print(f"{code}: {count}x")

# Create the destination directory first: opening a file for writing does not
# create missing parent directories and would raise FileNotFoundError.
TOP_RULES_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(TOP_RULES_PATH, "w", encoding="utf-8") as f:
    # Only the rule codes are stored; the counts are dropped.
    json.dump([code for code, _ in top_rules], f, indent=2)

print(f"💾 Zapisano top 30 do {TOP_RULES_PATH}")

DL4006: 7895x
DL3008: 6372x
SC2086: 6334x
DL3003: 5372x
DL3015: 4657x
DL3047: 4625x
DL3009: 4286x
DL3004: 3437x
DL4001: 2776x
DL4000: 2759x
DL3059: 2552x
DL3018: 1926x
SC2016: 1817x
SC2046: 1709x
DL3006: 1614x
SC2028: 1480x
DL3027: 1444x
DL3020: 1098x
DL3025: 996x
DL3042: 981x
DL3013: 796x
DL3007: 793x
DL3033: 723x
SC2043: 703x
DL3019: 551x
DL3005: 407x
DL3002: 394x
DL3048: 348x
DL3045: 309x
DL3032: 293x
💾 Zapisano top 30 do ../data/metadata/top_rules.json

# 📏 File length statistics. A "length" here is len() of each record's
# "content" field, so the unit depends on how content is stored.
lengths_np = np.array(lengths)
mean_length = lengths_np.mean()
median_length = np.median(lengths_np)
shortest, longest = lengths_np.min(), lengths_np.max()

print(f"📏 Średnia długość: {mean_length:.2f}")
print(f"📏 Mediana długości: {median_length:.0f}")
print(f"📏 Min: {shortest}, Max: {longest}")

📏 Średnia długość: 48.42
📏 Mediana długości: 47
📏 Min: 5, Max: 532

# 📊 Histogram of the number of triggered rules per file
# Unit-width bins, one per integer count. The first edge is normally 1, but it
# drops to 0 when some file has no rules, so such files are not silently left
# out of the plot. default=1 keeps min()/max() from raising on an empty list.
lowest = min(rules_per_file, default=1)
highest = max(rules_per_file, default=1)
bin_edges = range(min(lowest, 1), highest + 2)

plt.figure()
plt.hist(rules_per_file, bins=bin_edges, color="salmon", edgecolor="black")
plt.title("Liczba reguł naruszonych na plik")
plt.xlabel("Liczba reguł")
plt.ylabel("Liczba plików")
plt.grid(True)
plt.tight_layout()
plt.show()

# 📊 Histogram of the number of distinct error lines per file
# Unit-width bins, one per integer count. The first edge is normally 1, but it
# drops to 0 when some file reports no lines (e.g. a "bad" record without a
# "lines" entry), so such files still show up in the plot. default=1 keeps
# min()/max() from raising on an empty list.
lowest = min(lines_with_errors_per_file, default=1)
highest = max(lines_with_errors_per_file, default=1)
bin_edges = range(min(lowest, 1), highest + 2)

plt.figure()
plt.hist(lines_with_errors_per_file, bins=bin_edges, color="orchid", edgecolor="black")
plt.title("Liczba linii z błędami w pliku")
plt.xlabel("Liczba linii z błędami")
plt.ylabel("Liczba plików")
plt.grid(True)
plt.tight_layout()
plt.show()

# 📊 Histogram of error positions: every collected position from every file,
# grouped into 50 equal-width bins.
fig, ax = plt.subplots()
ax.hist(line_positions, bins=50, color="gold", edgecolor="black")
ax.set_title("Rozkład pozycji błędów (linie)")
ax.set_xlabel("Numer linii")
ax.set_ylabel("Liczba błędów")
ax.grid(True)
fig.tight_layout()
plt.show()