"""Compare the ``Ratio`` values of two Boa output files (Java vs. Python).

The script reads both files, draws fixed-seed random samples, runs a
two-sided Mann-Whitney U test on them, reports Cohen's d as effect size and
searches for the largest sample size at which the test is still not
significant.
"""
import math
import time

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats

# Prefer the interactive Tk backend. ``force=False`` keeps the module
# importable on machines where Tk (or a display) is unavailable; in that case
# matplotlib stays on whatever backend it already selected.
matplotlib.use('TkAgg', force=False)

# Input paths
JAVA_INPUT = './boa-output/try_Java_Jan_2022_last_revision.boa.output.txt'
PYTHON_INPUT = './boa-output/try_Python_Feb_2022_last_revision.boa.output.txt'

# Configuration of read_sample
SEED = 1           # fixed seed so that repeated runs draw the same samples
SHOW_PLOTS = True  # show box plot / histogram for the samples read in main()
SAMPLES = 1000     # sample size used in main()


def _load_ratios(file_path: str) -> pd.DataFrame:
    """Parse one input file into a DataFrame with columns Project and Ratio.

    Every line is split on ``[`` and on ``]`` + whitespace + ``=``, i.e. lines
    are expected to look like ``name[project] = ratio``. The first field is
    parsed as ``Variable`` and then dropped via ``usecols``.

    ``file_path`` may also be a URL, e.g. "https://..../example.csv".
    """
    return pd.read_csv(
        file_path,
        sep=r"\[|\]\s=",
        engine="python",  # regex separators need the python engine
        index_col=False,
        # nrows=25,  # enable to read only a few rows while testing
        skipinitialspace=True,
        names=['Variable', 'Project', 'Ratio'],
        usecols=['Project', 'Ratio'],
    )


def _draw(df: pd.DataFrame, size: int) -> pd.DataFrame:
    """Return ``size`` random rows of ``df`` using the module-wide SEED."""
    return df.sample(size, random_state=int(SEED))


def read_sample(file_path, title='', sample_size=0, show=True):
    """Read a file and optionally reduce it to a random sample.

    Args:
        file_path: Path (or URL) of the input file.
        title: Title passed to ``plt.title`` when ``show`` is true.
        sample_size: Number of rows to draw; values <= 0 keep all rows.
        show: If true, print ``df.info()`` and display a box plot and a
            histogram of the ``Ratio`` column.

    Returns:
        DataFrame with the columns ``Project`` and ``Ratio``.

    Raises:
        ValueError: Raised by pandas if ``sample_size`` exceeds the number
            of rows in the file.
    """
    df = _load_ratios(file_path)

    if sample_size > 0:
        # Fixed seed for better comparability between runs.
        df = _draw(df, sample_size)

    if show:
        df.info()
        df.boxplot(column=['Ratio'], grid=False)
        df.hist(column=['Ratio'], grid=False)
        # plt.title targets the current axes.
        plt.title(title)
        plt.show()

    return df


def cohen_d(x, y):
    """Return Cohen's d of two samples using the pooled standard deviation.

    Args:
        x: First sample; must provide ``mean()``, ``var(ddof=1)`` and
            ``len()`` (main() passes pandas Series).
        y: Second sample, same requirements as ``x``.

    Returns:
        ``(mean(x) - mean(y)) / pooled_std``.
    """
    # Group means
    mean_diff = x.mean() - y.mean()

    # Group variances (sample variance, ddof=1)
    var_x, var_y = x.var(ddof=1), y.var(ddof=1)

    # Group sizes
    size_x, size_y = len(x), len(y)

    # Pooled standard deviation
    pool_std = math.sqrt(
        ((size_x - 1) * var_x + (size_y - 1) * var_y) / (size_x + size_y - 2)
    )

    return mean_diff / pool_std


def _p_value_at(x_df: pd.DataFrame, y_df: pd.DataFrame, size: int) -> float:
    """Return the two-sided Mann-Whitney p-value for seeded samples of ``size``."""
    x_sample = _draw(x_df, size)["Ratio"]
    y_sample = _draw(y_df, size)["Ratio"]
    _, p_value = stats.mannwhitneyu(x_sample, y_sample, alternative="two-sided")
    return p_value


def find_barrier(x_sample_path, y_sample_path, alpha=0.1, lower_limit=2, upper_limit=10000):
    """Binary-search the largest sample size whose test is not significant.

    Both files are parsed once; every probe draws a seeded sample of the
    probed size from each of them and runs a two-sided Mann-Whitney U test.

    NOTE(review): a binary search only finds "the" boundary if significance
    is monotone in the sample size, which the code does not verify. Treat the
    result as an experimentally found boundary.

    Args:
        x_sample_path: Path (or URL) of the first input file.
        y_sample_path: Path (or URL) of the second input file.
        alpha: Significance level; ``p < alpha`` counts as significant.
        lower_limit: Smallest sample size to probe (values < 1 are raised
            to 1).
        upper_limit: Largest sample size to probe; clamped to the number of
            rows of the smaller file so that sampling cannot fail.

    Returns:
        The largest probed sample size with ``p >= alpha``, or ``-1`` if no
        probed size was non-significant (or the search range is empty).
    """
    x_df = _load_ratios(x_sample_path)
    y_df = _load_ratios(y_sample_path)

    # Sampling without replacement cannot exceed the smaller population.
    max_size = min(len(x_df), len(y_df))
    first = max(lower_limit, 1)
    last = min(upper_limit, max_size)

    left, right = first, last
    barrier_size = -1

    # Binary search for the barrier
    while left <= right:
        mid = (left + right) // 2

        if _p_value_at(x_df, y_df, mid) >= alpha:
            # Remember the current lower bound; no significant result found,
            # so try a larger sample.
            barrier_size = mid
            left = mid + 1
        else:
            # Significant result found, try a smaller sample.
            right = mid - 1

    if barrier_size == -1:
        # Nothing to double-check: sample sizes 0 / -1 would be meaningless.
        print(f"\nKeine nicht signifikante Probengröße im Bereich [{first}, {last}] gefunden.")
        return barrier_size

    # Double-check both sides of the barrier and report them.
    next_size = barrier_size + 1
    if next_size <= max_size:
        significant = bool(_p_value_at(x_df, y_df, next_size) < alpha)
        print(f"\nSind die Unterschiede bei {next_size} Proben signifikant? {significant}")
    else:
        print(f"\nFür {next_size} Proben reichen die Daten nicht aus (maximal {max_size}).")

    significant = bool(_p_value_at(x_df, y_df, barrier_size) < alpha)
    print(f"Sind die Unterschiede bei {barrier_size} Proben signifikant? {significant}")

    return barrier_size


def main():
    """Run the complete analysis and print all results."""
    plt.close('all')

    print('Statistische Berechnungen zu Häufigkeiten (Übung 5)')

    print('\nEinlesen der ersten Stichprobe (Python)')
    python_sample = read_sample(PYTHON_INPUT, sample_size=SAMPLES, show=SHOW_PLOTS, title="Python")
    python_ratio = python_sample['Ratio']
    print(f"Mean:{python_ratio.mean()}")
    print(f"Variance:{python_ratio.var()}")

    print('\nEinlesen der zweiten Stichprobe (Java)')
    java_sample = read_sample(JAVA_INPUT, sample_size=SAMPLES, show=SHOW_PLOTS, title="Java")
    java_ratio = java_sample['Ratio']
    print(f"Mean:{java_ratio.mean()}")
    print(f"Variance:{java_ratio.var()}")

    print('\nStatistische Tests')

    # Mann-Whitney U test and effect size on the two samples
    stat, p_value = stats.mannwhitneyu(python_ratio, java_ratio, alternative='two-sided')
    effect_size = cohen_d(python_ratio, java_ratio)

    print(f"Globaler Durchschnitt Python: {python_ratio.mean()}")
    print(f"Globaler Durchschnitt Java: {java_ratio.mean()}")
    print(f"Mann-Whitney-Test: Statistik {stat}, P-Wert {p_value}")
    print(f"Effektstärke (Cohen's d): {effect_size}")

    # Experimental determination of the barrier
    barrier = find_barrier(PYTHON_INPUT, JAVA_INPUT, alpha=0.01)
    print(f"Untere Schranke, ab welcher der Test nicht signifikant ist: {barrier}")


if __name__ == '__main__':
    main()