130 lines
4.6 KiB
Python
130 lines
4.6 KiB
Python
|
import math
|
||
|
import time
|
||
|
|
||
|
import pandas as pd
|
||
|
import scipy.stats as stats
|
||
|
import matplotlib.pyplot as plt
|
||
|
|
||
|
import matplotlib
|
||
|
matplotlib.use('TkAgg')
|
||
|
|
||
|
# Input files handed to read_sample(), which parses them into 'Project' and 'Ratio' columns.
JAVA_INPUT = './data/try_Java_Jan_2022_last_revision.boa.output.txt'
PYTHON_INPUT = './data/try_Python_Feb_2022_last_revision.boa.output.txt'

# Passed by main() as read_sample(show=...): when True, df.info() and the plots are shown.
SHOW_PLOTS = False
# Fixed random_state used by read_sample() when sampling, so repeated runs draw the same rows.
SEED = 3
|
||
|
|
||
|
def read_sample(file_path, title='', sample_size=0, show=True):
    """Load a result file into a DataFrame with 'Project' and 'Ratio' columns.

    Each line is split into three fields ('Variable', 'Project', 'Ratio') by a
    regular expression that matches either an opening square bracket or a
    closing square bracket followed by one whitespace character and an equals
    sign. Only 'Project' and 'Ratio' are kept.

    Args:
        file_path: Path, URL or file-like object accepted by pandas.read_csv.
        title: Title set via plt.title() when plots are shown.
        sample_size: If greater than 0, draw that many rows at random using
            the module-level SEED; otherwise keep every row.
        show: If true, print df.info() and display a box plot and a histogram
            of 'Ratio'.

    Returns:
        The (optionally sampled) DataFrame.
    """
    field_names = ['Variable', 'Project', 'Ratio']
    frame = pd.read_csv(
        file_path,
        sep=r"\[|\]\s=",
        engine="python",
        index_col=False,
        skipinitialspace=True,
        names=field_names,
        usecols=field_names[1:],
    )

    if sample_size > 0:
        # Fixed seed so that runs stay comparable.
        frame = frame.sample(sample_size, random_state=int(SEED))

    if not show:
        return frame

    frame.info()
    for draw in (frame.boxplot, frame.hist):
        draw(column=['Ratio'], grid=False)
    # plt.title() acts on the current axes, i.e. the most recently created plot.
    plt.title(title)
    plt.show()
    return frame
|
||
|
|
||
|
def cohen_d(x, y):
    """Return Cohen's d for two samples, using the pooled standard deviation.

    d = (mean(x) - mean(y)) / sqrt(pooled variance), where the pooled variance
    weights each sample's unbiased variance (ddof=1) by its degrees of freedom.

    Args:
        x: First sample; must support len(), .mean() and .var(ddof=1).
        y: Second sample with the same interface.

    Returns:
        The effect size; positive when x has the larger mean.
    """
    n_x, n_y = len(x), len(y)

    # Unbiased sample variances, each weighted by its degrees of freedom.
    weighted_x = (n_x - 1) * x.var(ddof=1)
    weighted_y = (n_y - 1) * y.var(ddof=1)

    pooled_std = math.sqrt((weighted_x + weighted_y) / (n_x + n_y - 2))
    return (x.mean() - y.mean()) / pooled_std
|
||
|
|
||
|
def find_barrier(x_sample_path, y_sample_path, alpha=0.1, lower_limit=2, upper_limit=10000):
    """Binary-search the largest probed sample size whose test is not significant.

    For a candidate size, that many 'Ratio' values are drawn from each file
    (fixed seed SEED) and compared with a two-sided Mann-Whitney U test. A
    candidate counts as "not significant" when p_value >= alpha.

    NOTE: binary search only finds a true threshold if significance changes
    monotonically with the sample size; the code does not verify this.

    Args:
        x_sample_path: Path of the first input file (read via read_sample).
        y_sample_path: Path of the second input file.
        alpha: Significance level.
        lower_limit: Smallest sample size to probe (values below 1 are raised to 1).
        upper_limit: Largest sample size to probe (capped at the number of rows
            available in the smaller file).

    Returns:
        The largest probed size with a non-significant result, or -1 if every
        probed size was significant (or the search range was empty).
    """
    # Parse each file once; sampling from the cached column with the same seed
    # selects the same rows as re-reading the file for every probe would.
    x_all = read_sample(x_sample_path, sample_size=0, show=False)["Ratio"]
    y_all = read_sample(y_sample_path, sample_size=0, show=False)["Ratio"]
    seed = int(SEED)
    max_size = min(len(x_all), len(y_all))

    def p_value_for(size):
        # Two-sided Mann-Whitney U test on `size` seeded draws from each file.
        x_sample = x_all.sample(size, random_state=seed)
        y_sample = y_all.sample(size, random_state=seed)
        _, p_val = stats.mannwhitneyu(x_sample, y_sample, alternative="two-sided")
        return p_val

    # Keep the range valid: size 0 cannot be tested, and sampling more rows
    # than exist raises ValueError.
    left = max(lower_limit, 1)
    right = min(upper_limit, max_size)
    search_low, search_high = left, right
    barrier_size = -1

    # Binary search for the barrier.
    while left <= right:
        mid = (left + right) // 2
        if p_value_for(mid) >= alpha:
            # Remember the current lower bound; no significant result, try a larger sample.
            barrier_size = mid
            left = mid + 1
        else:
            # Significant result, try a smaller sample.
            right = mid - 1

    if barrier_size < 0:
        # Nothing to verify: reporting on sizes 0 / -1 would silently test the full data sets.
        print(f"\nKeine Stichprobengröße zwischen {search_low} und {search_high} "
              f"liefert ein nicht signifikantes Ergebnis.")
        return barrier_size

    if barrier_size < max_size:
        # Only check the next larger size when enough rows exist to draw it.
        p_value = p_value_for(barrier_size + 1)
        print(f"\nSind die Unterschiede bei {barrier_size + 1} Proben signifikant? {str(p_value < alpha)}")

    p_value = p_value_for(barrier_size)
    print(f"Sind die Unterschiede bei {barrier_size} Proben signifikant? {str(p_value < alpha)}")
    return barrier_size
|
||
|
|
||
|
def main():
    """Run the exercise: load both samples, compare them, and search the barrier.

    Prints mean and variance of the 'Ratio' column for a 1000-row sample of
    each input file, then the Mann-Whitney U test result and Cohen's d for the
    two samples, and finally the sample-size barrier returned by
    find_barrier() at alpha=0.01.
    """
    plt.close('all')
    print('Statistische Berechnungen zu Häufigkeiten (Übung 5)')

    # Load both samples with identical settings; Python first, then Java.
    ratios = {}
    sources = (
        ('\nEinlesen der ersten Stichprobe (Python)', PYTHON_INPUT, "Python"),
        ('\nEinlesen der zweiten Stichprobe (Java)', JAVA_INPUT, "Java"),
    )
    for heading, path, label in sources:
        print(heading)
        frame = read_sample(path, sample_size=1000, show=SHOW_PLOTS, title=label)
        ratios[label] = frame['Ratio']
        print('Mean:' + str(ratios[label].mean()))
        print('Variance:' + str(ratios[label].var()))

    python_ratio = ratios["Python"]
    java_ratio = ratios["Java"]

    print('\nStatistische Tests')

    # Mann-Whitney U test and effect size on the two samples.
    stat, p_value = stats.mannwhitneyu(python_ratio, java_ratio, alternative='two-sided')
    effect_size = cohen_d(python_ratio, java_ratio)

    print(f"Globaler Durchschnitt Python: {python_ratio.mean()}")
    print(f"Globaler Durchschnitt Java: {java_ratio.mean()}")

    print(f"Mann-Whitney-Test: Statistik {stat}, P-Wert {p_value}")
    print(f"Effektstärke (Cohen's d): {effect_size}")

    # Experimental determination of the barrier.
    barrier = find_barrier(PYTHON_INPUT, JAVA_INPUT, alpha=0.01)
    print(f"Untere Schranke, ab welcher der Test nicht signifikant ist: {barrier}")
|
||
|
|
||
|
# Run the analysis only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|
||
|
|
||
|
|
||
|
|