continue homework

2025-01-11 00:48:27 +01:00
parent 38709f0de0
commit bd30888c3f
16 changed files with 198296 additions and 207229 deletions
--- a/boa/analysis/.idea/.gitignore
+++ b/boa/analysis/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
--- a/boa/analysis/.idea/analysis.iml
+++ b/boa/analysis/.idea/analysis.iml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.13 (analysis)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/boa/analysis/.idea/inspectionProfiles/Project_Default.xml
+++ b/boa/analysis/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,12 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N802" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
--- a/boa/analysis/.idea/inspectionProfiles/profiles_settings.xml
+++ b/boa/analysis/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/boa/analysis/.idea/misc.xml
+++ b/boa/analysis/.idea/misc.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.13 (analysis)" />
+  </component>
+</project>
--- a/boa/analysis/.idea/modules.xml
+++ b/boa/analysis/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/analysis.iml" filepath="$PROJECT_DIR$/.idea/analysis.iml" />
+    </modules>
+  </component>
+</project>
--- a/boa/analysis/.idea/vcs.xml
+++ b/boa/analysis/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
+  </component>
+</project>
--- a/boa/analysis/aufgabe_02.py
+++ b/boa/analysis/aufgabe_02.py
@@ -0,0 +1,129 @@
+import math
+import time
+
+import pandas as pd
+import scipy.stats as stats
+import matplotlib.pyplot as plt
+
+import matplotlib
+matplotlib.use('TkAgg')
+
+# Paths (relative to the working directory) of the Boa job outputs that
+# read_sample() parses; main() compares the 'Ratio' values of both files.
+JAVA_INPUT = './data/try_Java_Jan_2022_last_revision.boa.output.txt'
+PYTHON_INPUT = './data/try_Python_Feb_2022_last_revision.boa.output.txt'
+
+# Passed by main() as read_sample()'s `show` flag; False suppresses info output and plots.
+SHOW_PLOTS = False
+# Random state handed to DataFrame.sample() so every run draws the same rows.
+SEED = 3
+
+def read_sample(file_path, title='', sample_size=0, show=True):
+    """Load a Boa output file and optionally subsample and plot it.
+
+    Each input line is split on ``[`` and on ``]`` followed by one whitespace
+    character and ``=``, giving the fields Variable, Project and Ratio; only
+    Project and Ratio are kept.
+
+    Args:
+        file_path: Path of the file to read (may also be a URL such as
+            "https://..../example.csv").
+        title: Title set via ``plt.title`` when ``show`` is true.
+        sample_size: If greater than 0, number of rows drawn at random with
+            the fixed module-level SEED; otherwise all rows are returned.
+        show: If true, print ``DataFrame.info()`` and display a box plot and
+            a histogram of the Ratio column.
+
+    Returns:
+        A DataFrame with the columns 'Project' and 'Ratio'.
+    """
+    frame = pd.read_csv(
+        file_path,
+        sep=r"\[|\]\s=",
+        engine="python",
+        index_col=False,
+        # nrows=25,  # enable to read only a few rows while testing
+        skipinitialspace=True,
+        names=['Variable', 'Project', 'Ratio'],
+        usecols=['Project', 'Ratio'],
+    )
+
+    wants_subsample = sample_size > 0
+    if wants_subsample:
+        # Fixed seed so that different runs stay comparable.
+        frame = frame.sample(sample_size, random_state=int(SEED))
+
+    if not show:
+        return frame
+
+    frame.info()
+    frame.boxplot(column=['Ratio'], grid=False)
+    frame.hist(column=['Ratio'], grid=False)
+    plt.title(title)
+    plt.show()
+    return frame
+
+def cohen_d(x, y):
+    """Compute Cohen's d effect size for two independent samples.
+
+    d = (mean(x) - mean(y)) / s_pooled, where s_pooled is the pooled sample
+    standard deviation built from the unbiased (ddof=1) variances.
+
+    Args:
+        x: First sample; must support ``len()``, ``.mean()`` and
+            ``.var(ddof=1)`` (e.g. a pandas Series).
+        y: Second sample with the same interface.
+
+    Returns:
+        The effect size; positive when ``x`` has the larger mean.
+
+    Raises:
+        ValueError: If either sample has fewer than two observations. An
+            unbiased variance is undefined in that case and would otherwise
+            surface as a silent NaN or a bare ZeroDivisionError.
+    """
+    # Sample sizes of both groups
+    size_x, size_y = len(x), len(y)
+    if size_x < 2 or size_y < 2:
+        raise ValueError(
+            "cohen_d needs at least two observations per sample, "
+            f"got {size_x} and {size_y}"
+        )
+
+    # Difference of the group means
+    mean_diff = x.mean() - y.mean()
+
+    # Unbiased variances of both groups
+    var_x, var_y = x.var(ddof=1), y.var(ddof=1)
+
+    # Pooled variance and pooled standard deviation (kept under separate names)
+    pooled_var = ((size_x - 1) * var_x + (size_y - 1) * var_y) / (size_x + size_y - 2)
+    pooled_std = math.sqrt(pooled_var)
+
+    # NOTE(review): a pooled standard deviation of 0 (both samples constant)
+    # is not handled specially; the division result is returned as is.
+    return mean_diff / pooled_std
+
+def _barrier_p_value(x_frame, y_frame, sample_size):
+    """Return the two-sided Mann-Whitney U p-value for seeded subsamples of both frames."""
+    # Mirror read_sample(): a positive size draws that many rows with the fixed
+    # SEED, a non-positive size uses every row.
+    if sample_size > 0:
+        x_frame = x_frame.sample(sample_size, random_state=int(SEED))
+        y_frame = y_frame.sample(sample_size, random_state=int(SEED))
+    _, p_value = stats.mannwhitneyu(x_frame["Ratio"], y_frame["Ratio"], alternative="two-sided")
+    return p_value
+
+
+def find_barrier(x_sample_path, y_sample_path, alpha=0.1, lower_limit=2, upper_limit=10000):
+    """Search for the largest sample size at which the two data sets do not differ significantly.
+
+    A binary search over the sample size runs a two-sided Mann-Whitney U test
+    on equally sized, seeded subsamples of both files and keeps the largest
+    probed size whose p-value is at least ``alpha``.
+
+    NOTE(review): binary search presumes that significance is monotone in the
+    sample size; for random subsamples this is only an approximation.
+
+    Args:
+        x_sample_path: Path of the first Boa output file.
+        y_sample_path: Path of the second Boa output file.
+        alpha: Significance level; ``p < alpha`` counts as significant.
+        lower_limit: Smallest sample size to probe.
+        upper_limit: Largest sample size to probe; clamped to the number of
+            rows available in the smaller file.
+
+    Returns:
+        The largest probed sample size with a non-significant result, or -1
+        if every probed size was significant.
+    """
+    # Parse each file once up front instead of once per probe.
+    x_frame = read_sample(x_sample_path, sample_size=0, show=False)
+    y_frame = read_sample(y_sample_path, sample_size=0, show=False)
+
+    # Sampling is without replacement, so no probe may exceed the smaller data set.
+    max_size = min(len(x_frame), len(y_frame))
+    search_upper = min(upper_limit, max_size)
+
+    left = lower_limit
+    right = search_upper
+    barrier_size = -1
+
+    # Binary search for the barrier
+    while left <= right:
+        mid = (left + right) // 2
+        p_value = _barrier_p_value(x_frame, y_frame, mid)
+
+        if p_value >= alpha:
+            # Remember the current candidate and try a larger sample
+            barrier_size = mid
+            left = mid + 1
+        else:
+            # Significant result, try a smaller sample
+            right = mid - 1
+
+    if barrier_size < 0:
+        # Nothing to double-check: sizes 0 and -1 would silently test the full data.
+        print(f"\nKeine Stichprobengröße zwischen {lower_limit} und {search_upper} liefert ein nicht signifikantes Ergebnis.")
+        return barrier_size
+
+    if barrier_size + 1 <= max_size:
+        p_value = _barrier_p_value(x_frame, y_frame, barrier_size + 1)
+        print(f"\nSind die Unterschiede bei {barrier_size + 1} Proben signifikant? {str(p_value < alpha)}")
+    else:
+        # The next larger size does not exist in the data; keep the blank separator line.
+        print()
+
+    p_value = _barrier_p_value(x_frame, y_frame, barrier_size)
+    print(f"Sind die Unterschiede bei {barrier_size} Proben signifikant? {str(p_value < alpha)}")
+    return barrier_size
+
+def main():
+    """Run the analysis: load both samples, print their statistics and test results.
+
+    Draws 1000 rows from each input file, prints mean and variance of the
+    Ratio column, runs a two-sided Mann-Whitney U test with Cohen's d as the
+    effect size, and finally determines the sample-size barrier at alpha=0.01.
+    """
+    plt.close('all')
+    print('Statistische Berechnungen zu Häufigkeiten (Übung 5)')
+
+    print('\nEinlesen der ersten Stichprobe (Python)')
+    python_sample = read_sample(PYTHON_INPUT, title="Python", sample_size=1000, show=SHOW_PLOTS)
+    python_ratio = python_sample['Ratio']
+    print('Mean:', python_ratio.mean(), sep='')
+    print('Variance:', python_ratio.var(), sep='')
+
+    print('\nEinlesen der zweiten Stichprobe (Java)')
+    java_sample = read_sample(JAVA_INPUT, title="Java", sample_size=1000, show=SHOW_PLOTS)
+    java_ratio = java_sample['Ratio']
+    print('Mean:', java_ratio.mean(), sep='')
+    print('Variance:', java_ratio.var(), sep='')
+
+    print('\nStatistische Tests')
+    # Exercise solution starts here
+
+    # Mann-Whitney U test plus effect size
+    stat, p_value = stats.mannwhitneyu(python_ratio, java_ratio, alternative='two-sided')
+    effect_size = cohen_d(python_ratio, java_ratio)
+
+    print(f"Globaler Durchschnitt Python: {python_ratio.mean()}")
+    print(f"Globaler Durchschnitt Java: {java_ratio.mean()}")
+
+    print(f"Mann-Whitney-Test: Statistik {stat}, P-Wert {p_value}")
+    print(f"Effektstärke (Cohen's d): {effect_size}")
+
+    # Experimental determination of the barrier
+    barrier = find_barrier(PYTHON_INPUT, JAVA_INPUT, alpha=0.01)
+    print(f"Untere Schranke, ab welcher der Test nicht signifikant ist: {barrier}")
+
+# Run the analysis only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
+
+
+
--- a/boa/analysis/data/try_Java_Jan_2022_last_revision.boa.output.txt
+++ b/boa/analysis/data/try_Java_Jan_2022_last_revision.boa.output.txt
--- a/boa/analysis/data/try_Python_Feb_2022_last_revision.boa.output.txt
+++ b/boa/analysis/data/try_Python_Feb_2022_last_revision.boa.output.txt
--- a/boa/analysis/test.py
+++ b/boa/analysis/test.py
@@ -0,0 +1,9 @@
+import random
+
+# Fixed seed: the six draws below are the same on every run.
+random.seed(1)
+for _ in range(6):
+    # randint includes both endpoints, so each value lies in 0..10.
+    print(random.randint(0, 10))
--- a/boa/mining/aufgabe_01_a.boa
+++ b/boa/mining/aufgabe_01_a.boa
--- a/boa/mining/aufgabe_01_b.boa
+++ b/boa/mining/aufgabe_01_b.boa
@@ -1,5 +1,5 @@
-# Java-Job: 111128
-# Python-Job: 111124
+# Java-Job: 111694
+# Python-Job: 111439

 # Includes all revisions
 p: Project = input;
@@ -12,7 +12,7 @@ cur_date: time;

 statement_counter := visitor {
    before node: Statement -> {
-        if(node.kind == StatementKind.TRY or node.kind == StatementKind.WITH)
+        if (def(node.kind) and (node.kind == StatementKind.TRY or node.kind == StatementKind.WITH))
            relative_list[p.name][yearof(cur_date)] << 1;
        else
            relative_list[p.name][yearof(cur_date)] << 0;
@@ -22,12 +22,11 @@ statement_counter := visitor {
 visit(p, visitor {
    before node: CodeRepository -> {
        for (minus_year: int=22; minus_year >= 0; minus_year--) {
-            cur_date = addyear(now(), -minus_year);
+            cur_date = addyear(T"Dec 31, 2022, 10:00:00 AM", -minus_year);
            snapshot := getsnapshot(node, cur_date);

-            if (def(snapshot))
-                foreach (i: int; def(snapshot[i]))
-                    visit(snapshot[i], statement_counter);
+            foreach (i: int; def(snapshot[i]))
+                visit(snapshot[i], statement_counter);
        }
    }
 });
--- a/boa/mining/boa-job111005-output.txt
+++ b/boa/mining/boa-job111005-output.txt
--- a/boa/mining/boa-job111006-output.txt
+++ b/boa/mining/boa-job111006-output.txt
--- a/boa/mining/boa-job111439-output.txt
+++ b/boa/mining/boa-job111439-output.txt