Softwaretechnik-II/boa/analysis/aufgabe_02.py

import math
import time

import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

import matplotlib
matplotlib.use('TkAgg')

# Input file with the per-project ratios for Java (a Boa output file, judging by its name);
# read by read_sample().
JAVA_INPUT = './data/try_Java_Jan_2022_last_revision.boa.output.txt'
# Input file with the per-project ratios for Python (a Boa output file, judging by its name);
# read by read_sample().
PYTHON_INPUT = './data/try_Python_Feb_2022_last_revision.boa.output.txt'

# Passed by main() as the `show` argument of read_sample(); True displays the plots.
SHOW_PLOTS = False
# Random state handed to DataFrame.sample() so that repeated runs draw the same rows.
SEED = 3

def read_sample(file_path, title='', sample_size=0, show=True):
    """Load the 'Project' and 'Ratio' columns of an input file and optionally sample them.

    Every input line is split into three fields on ``[`` and on ``]`` followed by
    one whitespace character and ``=``. The first field ('Variable') is dropped.

    Args:
        file_path: Path of the file to read; a URL such as
            "https://..../example.csv" works as well.
        title: Title given to the plot when ``show`` is true.
        sample_size: Number of rows to draw at random. Values <= 0 keep all rows.
        show: When true, print the frame info and display a box plot and a
            histogram of the 'Ratio' column.

    Returns:
        The (possibly sampled) DataFrame with the columns 'Project' and 'Ratio'.
    """
    # While testing, nrows=25 can be added here to read only a few rows.
    read_options = dict(
        sep=r"\[|\]\s=",
        engine="python",
        index_col=False,
        skipinitialspace=True,
        names=['Variable', 'Project', 'Ratio'],
        usecols=['Project', 'Ratio'],
    )
    frame = pd.read_csv(file_path, **read_options)

    if sample_size > 0:
        # A fixed seed keeps the drawn rows comparable between runs.
        frame = frame.sample(sample_size, random_state=int(SEED))

    if not show:
        return frame

    frame.info()
    frame.boxplot(column=['Ratio'], grid=False)
    frame.hist(column=['Ratio'], grid=False)
    plt.title(title)
    plt.show()
    return frame

def cohen_d(x, y):
    """Return Cohen's d for two groups, using the pooled standard deviation.

    Args:
        x: First group; must support ``len()``, ``mean()`` and ``var(ddof=1)``
            (main() passes pandas Series).
        y: Second group, same requirements as ``x``.

    Returns:
        ``(mean(x) - mean(y)) / pooled_std``, where the pooled variance weights
        each group's sample variance by its degrees of freedom.
    """
    n_x = len(x)
    n_y = len(y)

    # Sample variances (ddof=1), each weighted by its group's degrees of freedom.
    weighted_var_sum = (n_x - 1) * x.var(ddof=1) + (n_y - 1) * y.var(ddof=1)
    pooled_std = math.sqrt(weighted_var_sum / (n_x + n_y - 2))

    return (x.mean() - y.mean()) / pooled_std

def find_barrier(x_sample_path, y_sample_path, alpha=0.1, lower_limit=2, upper_limit=10000):
    """Binary-search the largest sample size whose Mann-Whitney U test is not significant.

    For a candidate size ``n``, ``n`` rows are drawn from each file with the fixed
    random state ``SEED`` and their 'Ratio' columns are compared with a two-sided
    Mann-Whitney U test. A result counts as significant when ``p < alpha``.

    NOTE: binary search assumes that the test stays significant for every size
    above the barrier. The p-value is not guaranteed to be monotone in the sample
    size, so the result is an experimental estimate.

    Args:
        x_sample_path: Path of the first input file (passed to read_sample()).
        y_sample_path: Path of the second input file (passed to read_sample()).
        alpha: Significance level.
        lower_limit: Smallest sample size to try (values below 1 are treated as 1).
        upper_limit: Largest sample size to try; capped at the number of rows
            available in the smaller file, because sampling more rows than exist
            raises ValueError.

    Returns:
        The largest tried sample size with ``p >= alpha``, or -1 if no size in the
        search range produced a non-significant result.
    """
    # Parse each file once; re-reading both files for every candidate size is
    # loop-invariant work.
    x_data = read_sample(x_sample_path, show=False)
    y_data = read_sample(y_sample_path, show=False)
    available = min(len(x_data), len(y_data))

    def p_value_at(size):
        # Same draw as read_sample(path, sample_size=size) because the seed is fixed.
        x_sample = x_data.sample(size, random_state=int(SEED))["Ratio"]
        y_sample = y_data.sample(size, random_state=int(SEED))["Ratio"]
        _, p_value = stats.mannwhitneyu(x_sample, y_sample, alternative="two-sided")
        return p_value

    search_low = max(lower_limit, 1)
    search_high = min(upper_limit, available)
    left, right = search_low, search_high
    barrier_size = -1

    # Binary search for the barrier
    while left <= right:
        mid = (left + right) // 2
        if p_value_at(mid) >= alpha:
            # Not significant: remember this size and try larger samples.
            barrier_size = mid
            left = mid + 1
        else:
            # Significant: try smaller samples.
            right = mid - 1

    if barrier_size < 0:
        # Without a barrier there is nothing to re-check; sizes 0 and -1 would
        # silently test the complete data set instead.
        print(f"\nKeine Stichprobengröße zwischen {search_low} und {search_high} "
              f"liefert ein nicht signifikantes Ergebnis.")
        return barrier_size

    if barrier_size + 1 <= available:
        p_value = p_value_at(barrier_size + 1)
        print(f"\nSind die Unterschiede bei {barrier_size + 1} Proben signifikant? {str(p_value < alpha)}")
    else:
        # The barrier equals the number of available rows; a larger sample cannot be drawn.
        print()

    p_value = p_value_at(barrier_size)
    print(f"Sind die Unterschiede bei {barrier_size} Proben signifikant? {str(p_value < alpha)}")
    return barrier_size

def main():
    """Run the analysis: load both samples, compare them, and search for the barrier.

    Draws 1000 rows from each input file, prints mean and variance of the
    'Ratio' column, runs a two-sided Mann-Whitney U test together with
    Cohen's d, and finally calls find_barrier() with alpha=0.01.
    """
    plt.close('all')
    print('Statistische Berechnungen zu Häufigkeiten (Übung 5)')

    print('\nEinlesen der ersten Stichprobe (Python)')
    python_sample = read_sample(PYTHON_INPUT, title="Python", sample_size=1000, show=SHOW_PLOTS)
    python_ratio = python_sample['Ratio']
    print(f"Mean:{python_ratio.mean()!s}")
    print(f"Variance:{python_ratio.var()!s}")

    print('\nEinlesen der zweiten Stichprobe (Java)')
    java_sample = read_sample(JAVA_INPUT, title="Java", sample_size=1000, show=SHOW_PLOTS)
    java_ratio = java_sample['Ratio']
    print(f"Mean:{java_ratio.mean()!s}")
    print(f"Variance:{java_ratio.var()!s}")

    print('\nStatistische Tests')

    # Mann-Whitney U test and effect size on the two drawn samples
    u_statistic, p_value = stats.mannwhitneyu(python_ratio, java_ratio, alternative='two-sided')
    effect_size = cohen_d(python_ratio, java_ratio)

    print(f"Globaler Durchschnitt Python: {python_ratio.mean()}")
    print(f"Globaler Durchschnitt Java: {java_ratio.mean()}")

    print(f"Mann-Whitney-Test: Statistik {u_statistic}, P-Wert {p_value}")
    print(f"Effektstärke (Cohen's d): {effect_size}")

    # Experimental determination of the barrier
    barrier = find_barrier(PYTHON_INPUT, JAVA_INPUT, alpha=0.01)
    print(f"Untere Schranke, ab welcher der Test nicht signifikant ist: {barrier}")

# Run the analysis only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
continue homework 2025-01-11 00:48:27 +01:00			`import math`
			`import time`

			`import pandas as pd`
			`import scipy.stats as stats`
			`import matplotlib.pyplot as plt`

			`import matplotlib`
			`matplotlib.use('TkAgg')`

			`JAVA_INPUT = './data/try_Java_Jan_2022_last_revision.boa.output.txt'`
			`PYTHON_INPUT = './data/try_Python_Feb_2022_last_revision.boa.output.txt'`

			`SHOW_PLOTS = False`
			`SEED = 3`

			`def read_sample(file_path, title='', sample_size=0, show=True):`
			`# filename kann auch ein URL sein: "https://..../example.csv"`
			`df = pd.read_csv(file_path,`
			`sep=r"\[\|\]\s=",`
			`engine="python",`
			`index_col=False,`
			`#nrows=25, # zum Testen nur kleine Anzahl einlesen`
			`skipinitialspace=True,`
			`names=['Variable', 'Project', 'Ratio'],`
			`usecols=['Project', 'Ratio']`
			`)`

			`if sample_size > 0:`
			`# feste seed zum besseren Vergleich`
			`df = df.sample(sample_size, random_state=int(SEED))`

			`if show:`
			`df.info()`
			`df.boxplot(column=['Ratio'], grid=False)`
			`df.hist(column=['Ratio'], grid=False)`
			`plt.title(title)`
			`plt.show()`

			`return df`

			`def cohen_d(x, y):`
			`# Mittelwerte der Gruppen`
			`mean_x, mean_y = x.mean(), y.mean()`
			`mean_diff = mean_x - mean_y`

			`# Varianzen der Gruppen`
			`var_x, var_y = x.var(ddof=1), y.var(ddof=1)`

			`# Stichprobengrößen der Gruppen`
			`size_x, size_y = len(x), len(y)`

			`# Gepoolte Varianz und Standardabweichung`
			`pool_var = ((size_x - 1) * var_x + (size_y - 1) * var_y) / (size_x + size_y - 2)`
			`pool_var = math.sqrt(pool_var)`

			`# Cohen's d`
			`d_val = mean_diff / pool_var`
			`return d_val`

			`def find_barrier(x_sample_path, y_sample_path, alpha=0.1, lower_limit=2, upper_limit=10000):`
			`left = lower_limit`
			`right = upper_limit`
			`barrier_size = -1`

			`# Binäre Suche nach der Schranke`
			`while left <= right:`
			`mid = (left + right) // 2`
			`x_sample = read_sample(x_sample_path, sample_size=mid, show=False)["Ratio"]`
			`y_sample = read_sample(y_sample_path, sample_size=mid, show=False)["Ratio"]`
			`stat, p_value = stats.mannwhitneyu(x_sample, y_sample, alternative="two-sided")`

			`if p_value >= alpha:`
			`# Vermerke die aktuelle untere Schranke`
			`barrier_size = mid`
			`# Kein signifikantes Ergebnis gefunden, probiere größere Probe`
			`left = mid + 1`
			`else:`
			`# Signifikantes Ergebnis gefunden, probiere kleinere Probe`
			`right = mid - 1`

			`x_sample = read_sample(x_sample_path, sample_size=barrier_size + 1, show=False)["Ratio"]`
			`y_sample = read_sample(y_sample_path, sample_size=barrier_size + 1, show=False)["Ratio"]`
			`stat, p_value = stats.mannwhitneyu(x_sample, y_sample, alternative="two-sided")`
			`print(f"\nSind die Unterschiede bei {barrier_size + 1} Proben signifikant? {str(p_value < alpha)}")`

			`x_sample = read_sample(x_sample_path, sample_size=barrier_size, show=False)["Ratio"]`
			`y_sample = read_sample(y_sample_path, sample_size=barrier_size, show=False)["Ratio"]`
			`stat, p_value = stats.mannwhitneyu(x_sample, y_sample, alternative="two-sided")`
			`print(f"Sind die Unterschiede bei {barrier_size} Proben signifikant? {str(p_value < alpha)}")`
			`return barrier_size`

			`def main():`
			`plt.close('all')`
			`print('Statistische Berechnungen zu Häufigkeiten (Übung 5)')`

			`print('\nEinlesen der ersten Stichprobe (Python)')`
			`python_sample = read_sample(PYTHON_INPUT, sample_size=1000, show=SHOW_PLOTS, title="Python")`
			`print('Mean:' + str(python_sample['Ratio'].mean()))`
			`print('Variance:' + str(python_sample['Ratio'].var()))`

			`print('\nEinlesen der zweiten Stichprobe (Java)')`
			`java_sample = read_sample(JAVA_INPUT, sample_size=1000, show=SHOW_PLOTS, title="Java")`
			`print('Mean:' + str(java_sample['Ratio'].mean()))`
			`print('Variance:' + str(java_sample['Ratio'].var()))`

			`print('\nStatistische Tests')`
			`# Aufgabenbearbeitung ab hier`

			`# Mann-Whitney-U-Test`
			`stat, p_value = stats.mannwhitneyu(python_sample['Ratio'], java_sample['Ratio'], alternative='two-sided')`
			`effect_size = cohen_d(python_sample['Ratio'], java_sample['Ratio'])`

			`print(f"Globaler Durchschnitt Python: {python_sample.get('Ratio').mean()}")`
			`print(f"Globaler Durchschnitt Java: {java_sample.get('Ratio').mean()}")`

			`print(f"Mann-Whitney-Test: Statistik {stat}, P-Wert {p_value}")`
			`print(f"Effektstärke (Cohen's d): {effect_size}")`

			`# Experimentelle Bestimmung der Schranke`
			`barrier = find_barrier(PYTHON_INPUT, JAVA_INPUT, alpha=0.01)`
			`print(f"Untere Schranke, ab welcher der Test nicht signifikant ist: {barrier}")`

			`# Press the green button in the gutter to run the script.`
			`if __name__ == '__main__':`
			`main()`