{ "cells": [ { "cell_type": "markdown", "id": "a8cafc13", "metadata": {}, "source": [ "# Erkennen und Filtern von Ausreißern\n", "\n", "Das Filtern oder Transformieren von Ausreißern ist weitgehend eine Frage der Anwendung von Array-Operationen. Betrachtet einen DataFrame mit einigen normal verteilten Daten:" ] }, { "cell_type": "code", "execution_count": 1, "id": "3036ad5f", "metadata": { "execution": { "iopub.execute_input": "2026-05-22T13:29:06.434883Z", "iopub.status.busy": "2026-05-22T13:29:06.434514Z", "iopub.status.idle": "2026-05-22T13:29:06.665764Z", "shell.execute_reply": "2026-05-22T13:29:06.665467Z", "shell.execute_reply.started": "2026-05-22T13:29:06.434864Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.010284-0.0180330.012603-0.015950
std1.0069801.0154950.9891811.032210
min-3.175732-3.720231-3.093263-2.889419
25%-0.647827-0.683638-0.644732-0.747090
50%-0.030960-0.0117950.033418-0.023026
75%0.6658720.6773550.6564090.692767
max3.3103213.4845273.2756563.527782
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean -0.010284 -0.018033 0.012603 -0.015950\n", "std 1.006980 1.015495 0.989181 1.032210\n", "min -3.175732 -3.720231 -3.093263 -2.889419\n", "25% -0.647827 -0.683638 -0.644732 -0.747090\n", "50% -0.030960 -0.011795 0.033418 -0.023026\n", "75% 0.665872 0.677355 0.656409 0.692767\n", "max 3.310321 3.484527 3.275656 3.527782" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "\n", "rng = np.random.default_rng()\n", "df = pd.DataFrame(rng.normal(size=(1000, 4)))\n", "\n", "df.describe()" ] }, { "cell_type": "markdown", "id": "66bbe4f3", "metadata": {}, "source": [ "Angenommen, ihr wollt in einer der Spalten Werte finden, deren absoluter Wert größer als 3 oder kleiner als -3 ist:" ] }, { "cell_type": "code", "execution_count": 2, "id": "7706abfa", "metadata": { "execution": { "iopub.execute_input": "2026-05-22T13:29:06.666312Z", "iopub.status.busy": "2026-05-22T13:29:06.666191Z", "iopub.status.idle": "2026-05-22T13:29:06.668931Z", "shell.execute_reply": "2026-05-22T13:29:06.668635Z", "shell.execute_reply.started": "2026-05-22T13:29:06.666304Z" } }, "outputs": [ { "data": { "text/plain": [ "254 -3.238330\n", "422 3.484527\n", "464 -3.437452\n", "476 -3.019482\n", "499 -3.094016\n", "621 -3.720231\n", "674 3.145420\n", "Name: 1, dtype: float64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col = df[1]\n", "\n", "col[col.abs() > 3]" ] }, { "cell_type": "markdown", "id": "71ecdd17", "metadata": {}, "source": [ "Um alle Zeilen auszuwählen, in denen Wert größer 3 oder kleiner -3 in einer der Spalten ist, könnt ihr [pandas.DataFrame.any](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.any.html) auf einen booleschen DataFrame anwenden, wobei mit `any(axis=1)` üüberprüft wird, ob ein Wert in einer Zeile wahr ist:" ] }, { "cell_type": "code", "execution_count": 3, "id": "337e4285", "metadata": { "execution": { "iopub.execute_input": "2026-05-22T13:29:06.669416Z", "iopub.status.busy": "2026-05-22T13:29:06.669337Z", "iopub.status.idle": "2026-05-22T13:29:06.673368Z", "shell.execute_reply": "2026-05-22T13:29:06.673181Z", "shell.execute_reply.started": "2026-05-22T13:29:06.669409Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
68-0.9133200.675847-0.2375113.070655
1971.227823-0.5512393.2756560.875736
2540.562163-3.238330-0.808823-0.130129
3190.0112010.110650-1.3368743.000621
3700.9980571.127185-0.6480433.117027
404-0.340538-2.698506-3.093263-0.399621
4093.3103211.4423380.362357-1.605350
4220.1755413.4845270.432583-1.932812
4423.2887271.135310-0.3687621.578203
4641.496301-3.4374521.0504520.070361
470-0.1877020.4007443.081391-0.937770
4761.191677-3.019482-0.0254821.082824
4990.553608-3.094016-0.1019790.225328
6211.266543-3.720231-0.3642751.231816
661-1.055270-0.4868290.6170433.527782
6741.0355283.1454201.2982551.106587
901-3.1757320.553589-0.7676430.718555
9463.252568-0.487513-0.1074561.845214
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "68 -0.913320 0.675847 -0.237511 3.070655\n", "197 1.227823 -0.551239 3.275656 0.875736\n", "254 0.562163 -3.238330 -0.808823 -0.130129\n", "319 0.011201 0.110650 -1.336874 3.000621\n", "370 0.998057 1.127185 -0.648043 3.117027\n", "404 -0.340538 -2.698506 -3.093263 -0.399621\n", "409 3.310321 1.442338 0.362357 -1.605350\n", "422 0.175541 3.484527 0.432583 -1.932812\n", "442 3.288727 1.135310 -0.368762 1.578203\n", "464 1.496301 -3.437452 1.050452 0.070361\n", "470 -0.187702 0.400744 3.081391 -0.937770\n", "476 1.191677 -3.019482 -0.025482 1.082824\n", "499 0.553608 -3.094016 -0.101979 0.225328\n", "621 1.266543 -3.720231 -0.364275 1.231816\n", "661 -1.055270 -0.486829 0.617043 3.527782\n", "674 1.035528 3.145420 1.298255 1.106587\n", "901 -3.175732 0.553589 -0.767643 0.718555\n", "946 3.252568 -0.487513 -0.107456 1.845214" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df.abs() > 3).any(axis=1)]" ] }, { "cell_type": "markdown", "id": "5a6c3d55", "metadata": {}, "source": [ "Auf dieser Grundlage können die Werte begrenzt werden auf ein Intervall zwischen `-3` und `3`. Hierfür verwenden wir die Anweisung `np.sign(df)`, die Werte `1` und `-1` erzeugt, je nachdem, ob die Werte in `df` positiv oder negativ sind:" ] }, { "cell_type": "code", "execution_count": 4, "id": "a0d7f3e6", "metadata": { "execution": { "iopub.execute_input": "2026-05-22T13:29:06.673706Z", "iopub.status.busy": "2026-05-22T13:29:06.673629Z", "iopub.status.idle": "2026-05-22T13:29:06.680715Z", "shell.execute_reply": "2026-05-22T13:29:06.680386Z", "shell.execute_reply.started": "2026-05-22T13:29:06.673699Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.010960-0.0171530.012340-0.016666
std1.0037671.0086520.9877701.029968
min-3.000000-3.000000-3.000000-2.889419
25%-0.647827-0.683638-0.644732-0.747090
50%-0.030960-0.0117950.033418-0.023026
75%0.6658720.6773550.6564090.692767
max3.0000003.0000003.0000003.000000
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean -0.010960 -0.017153 0.012340 -0.016666\n", "std 1.003767 1.008652 0.987770 1.029968\n", "min -3.000000 -3.000000 -3.000000 -2.889419\n", "25% -0.647827 -0.683638 -0.644732 -0.747090\n", "50% -0.030960 -0.011795 0.033418 -0.023026\n", "75% 0.665872 0.677355 0.656409 0.692767\n", "max 3.000000 3.000000 3.000000 3.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.abs() > 3] = np.sign(df) * 3\n", "\n", "df.describe()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.13 Kernel", "language": "python", "name": "python313" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }