diff --git "a/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3-1.ipynb" "b/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3-1.ipynb" new file mode 100644 index 0000000..a0202d8 --- /dev/null +++ "b/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3-1.ipynb" @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPQWJCZo/Gi/6T9yGB+46vN"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# 럭키백의 확률"],"metadata":{"id":"Z0CJMJOEpp_L"}},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"V5M_AL9gpSSj","executionInfo":{"status":"ok","timestamp":1728701797259,"user_tz":-540,"elapsed":1340,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"44ddc7b9-8ee0-402c-fd80-c2441a1dce35"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Species Weight Length Diagonal Height Width\n","0 Bream 242.0 25.4 30.0 11.5200 4.0200\n","1 Bream 290.0 26.3 31.2 12.4800 4.3056\n","2 Bream 340.0 26.5 31.1 12.3778 4.6961\n","3 Bream 363.0 29.0 33.5 12.7300 4.4555\n","4 Bream 430.0 29.0 34.0 12.4440 5.1340"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
SpeciesWeightLengthDiagonalHeightWidth
0Bream242.025.430.011.52004.0200
1Bream290.026.331.212.48004.3056
2Bream340.026.531.112.37784.6961
3Bream363.029.033.512.73004.4555
4Bream430.029.034.012.44405.1340
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"fish","summary":"{\n \"name\": \"fish\",\n \"rows\": 159,\n \"fields\": [\n {\n \"column\": \"Species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"Bream\",\n \"Roach\",\n \"Pike\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Weight\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 357.9783165508931,\n \"min\": 0.0,\n \"max\": 1650.0,\n \"num_unique_values\": 101,\n \"samples\": [\n 770.0,\n 51.5,\n 197.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Length\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10.716328098884247,\n \"min\": 8.4,\n \"max\": 63.4,\n \"num_unique_values\": 93,\n \"samples\": [\n 14.7,\n 18.8,\n 19.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Diagonal\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11.610245832690964,\n \"min\": 8.8,\n \"max\": 68.0,\n \"num_unique_values\": 124,\n \"samples\": [\n 39.2,\n 27.2,\n 23.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Height\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.286207619968867,\n \"min\": 1.7284,\n \"max\": 18.957,\n \"num_unique_values\": 154,\n \"samples\": [\n 15.438,\n 7.293,\n 2.8728\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Width\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.6858038699921671,\n \"min\": 1.0476,\n \"max\": 8.142,\n \"num_unique_values\": 152,\n \"samples\": [\n 3.1571,\n 1.3936,\n 3.6835\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":1}],"source":["import pandas as pd\n","fish = pd.read_csv('https://bit.ly/fish_csv_data') # csv 파일 데이터 직접 읽음\n","fish.head()"]},{"cell_type":"code","source":["# 어떤 종류의 생선이 
있는지 unique 함수 사용하여 확인\n","print(pd.unique(fish['Species']))\n","\n","# Species 열을 타깃으로 만들고 나머지 5개 열은 입력 데이터로 사용하기 - 원하는 열을 리스트로 작성\n","fish_input = fish[['Weight', 'Length', 'Diagonal', 'Height', 'Width']].to_numpy()\n","print(fish_input[:5])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_TM8tyCXp_FV","executionInfo":{"status":"ok","timestamp":1728701924515,"user_tz":-540,"elapsed":3,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"8923db98-5923-4744-e842-376500cb355a"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']\n","[[242. 25.4 30. 11.52 4.02 ]\n"," [290. 26.3 31.2 12.48 4.3056]\n"," [340. 26.5 31.1 12.3778 4.6961]\n"," [363. 29. 33.5 12.73 4.4555]\n"," [430. 29. 34. 12.444 5.134 ]]\n"]}]},{"cell_type":"code","source":["fish_target = fish['Species'].to_numpy()\n","\n","# 머신러닝에는 기본으로 데이터 세트 2개가 필요.\n","from sklearn.model_selection import train_test_split\n","train_input, test_input, train_target, test_target = train_test_split(\n"," fish_input, fish_target, random_state=42\n",")"],"metadata":{"id":"64UeHjPGqb1s","executionInfo":{"status":"ok","timestamp":1728702009263,"user_tz":-540,"elapsed":348,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["# 훈련 세트의 통계값으로 테스트 세트 변환\n","from sklearn.preprocessing import StandardScaler\n","ss = StandardScaler()\n","ss.fit(train_input)\n","train_scaled = ss.transform(train_input)\n","test_scaled = ss.transform(test_input)"],"metadata":{"id":"-ziZlH2pq0xW","executionInfo":{"status":"ok","timestamp":1728702123270,"user_tz":-540,"elapsed":391,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":["# k-최근접 이웃 분류기의 확률 예측"],"metadata":{"id":"G71NsKUcrTJm"}},{"cell_type":"code","source":["from sklearn.neighbors import 
KNeighborsClassifier\n","\n","kn = KNeighborsClassifier(n_neighbors=3)\n","kn.fit(train_scaled, train_target)\n","\n","print(kn.score(train_scaled, train_target))\n","print(kn.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Sq6Zww_qrGys","executionInfo":{"status":"ok","timestamp":1728702171794,"user_tz":-540,"elapsed":697,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"bf30aa39-2e76-4661-a092-b4be7bbfda38"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["0.8907563025210085\n","0.85\n"]}]},{"cell_type":"markdown","source":["타깃 데이터에 2개 이상의 클래스가 포함된 문제를 '다중 분류' 라고 부름.\n","타깃값을 그대로 사이킷런 모델에 전달하면 순서가 자동으로 알파벳 순으로 매겨짐.\n","따라서, pd.unique(fish['Species']) 로 출력했던 순서와는 상이함.\n"],"metadata":{"id":"is-rDObqrhua"}},{"cell_type":"code","source":["print(kn.classes_)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TbyDodn_rcXy","executionInfo":{"status":"ok","timestamp":1728702260600,"user_tz":-540,"elapsed":376,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"501305d3-eb28-4974-9d8d-5a4ef816d71b"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']\n"]}]},{"cell_type":"code","source":["print(kn.predict(test_scaled[:5])) # 테스트 세트에 있는 처음 5개 샘플의 타깃값 예측"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MQZ7uw1CryIX","executionInfo":{"status":"ok","timestamp":1728702286885,"user_tz":-540,"elapsed":352,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"7685f400-878c-4c4f-aa4a-86b9dab68641"},"execution_count":9,"outputs":[{"output_type":"stream","name":"stdout","text":["['Perch' 'Smelt' 'Pike' 'Perch' 'Perch']\n"]}]},{"cell_type":"markdown","source":["사이킷런 분류 모델 : predict_proba() 메서드로 클래스별 확률값을 반환.\n","numpy round() 함수는 소수점 첫째 자리에서 반올림, decimals 매개변수로 유지할 소수점 아래 자릿수를 지정할 수 
있음."],"metadata":{"id":"mBd9odgBr7iK"}},{"cell_type":"code","source":["import numpy as np\n","proba = kn.predict_proba(test_scaled[:5])\n","print(np.round(proba, decimals=4)) # 소수점 네번째자리까지 표기, 다섯자리에서 반올림"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"THZRuEXMr4iX","executionInfo":{"status":"ok","timestamp":1728702445405,"user_tz":-540,"elapsed":357,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"d5d41f35-d8c4-4729-ef92-e50e72d3cc19"},"execution_count":10,"outputs":[{"output_type":"stream","name":"stdout","text":["[[0. 0. 1. 0. 0. 0. 0. ]\n"," [0. 0. 0. 0. 0. 1. 0. ]\n"," [0. 0. 0. 1. 0. 0. 0. ]\n"," [0. 0. 0.6667 0. 0.3333 0. 0. ]\n"," [0. 0. 0.6667 0. 0.3333 0. 0. ]]\n"]}]},{"cell_type":"code","source":["distances, indexes = kn.kneighbors(test_scaled[3:4]) # 슬라이싱 연산자는 하나의 배열만 선택해도 항상 2차원 배열이 생성됨.\n","print(train_target[indexes])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ewfitu5HsfPk","executionInfo":{"status":"ok","timestamp":1728702581773,"user_tz":-540,"elapsed":370,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"bc126346-cd00-4685-8c21-d6ab51233b2c"},"execution_count":11,"outputs":[{"output_type":"stream","name":"stdout","text":["[['Roach' 'Perch' 'Perch']]\n"]}]},{"cell_type":"markdown","source":["Roach 확률 1/3 , Perch 확률 2/3\n","그런데, 3개 최근접 이웃을 사용하니까 확률의 분모는 무조건 3인거 아닌가?"],"metadata":{"id":"dLOe8T7btfC5"}},{"cell_type":"markdown","source":["#로지스틱 회귀 : 분류 모델"],"metadata":{"id":"PpEPN6h7tot-"}},{"cell_type":"code","source":["# 확률이 아주 큰 음수일 때 0으로, 아주 큰 양수일 때 1이 되도록 하는 방법? 
시그모이드 함수 또는 로지스틱 함수 사용\n","# The exponential is computed with np.exp()\n","import numpy as np\n","import matplotlib.pyplot as plt\n","z = np.arange(-5, 5, 0.1)\n","phi = 1 / (1+np.exp(-z))\n","plt.plot(z,phi)\n","plt.xlabel('z')\n","plt.ylabel('phi')\n","plt.show()  # call show(); a bare plt.show only echoes the function object"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":618},"id":"Vtb46WTPtAgw","executionInfo":{"status":"ok","timestamp":1728702949444,"user_tz":-540,"elapsed":823,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"4a55e036-89e0-4393-dc03-6ab1202c7254"},"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":[""],"text/html":["
\n","
matplotlib.pyplot.show
def show(*args, **kwargs)
/usr/local/lib/python3.10/dist-packages/matplotlib/pyplot.pyDisplay all open figures.\n","\n","Parameters\n","----------\n","block : bool, optional\n","    Whether to wait for all figures to be closed before returning.\n","\n","    If `True` block and run the GUI main loop until all figure windows\n","    are closed.\n","\n","    If `False` ensure that all figure windows are displayed and return\n","    immediately.  In this case, you are responsible for ensuring\n","    that the event loop is running to have responsive figures.\n","\n","    Defaults to True in non-interactive mode and to False in interactive\n","    mode (see `.pyplot.isinteractive`).\n","\n","See Also\n","--------\n","ion : Enable interactive mode, which shows / updates the figure after\n","      every plotting command, so that calling ``show()`` is not necessary.\n","ioff : Disable interactive mode.\n","savefig : Save the figure to an image file instead of showing it on screen.\n","\n","Notes\n","-----\n","**Saving figures to file and showing a window at the same time**\n","\n","If you want an image file as well as a user interface window, use\n","`.pyplot.savefig` before `.pyplot.show`. At the end of (a blocking)\n","``show()`` the figure is closed and thus unregistered from pyplot. Calling\n","`.pyplot.savefig` afterwards would save a new and thus empty figure. This\n","limitation of command order does not apply if the show is non-blocking or\n","if you keep a reference to the figure and use `.Figure.savefig`.\n","\n","**Auto-show in jupyter notebooks**\n","\n","The jupyter backends (activated via ``%matplotlib inline``,\n","``%matplotlib notebook``, or ``%matplotlib widget``), call ``show()`` at\n","the end of every cell by default. Thus, you usually don't have to call it\n","explicitly there.
\n"," \n","
"]},"metadata":{},"execution_count":12},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"markdown","source":["# 이진 분류 : 시그모이드 함수의 출력이 0.5보다 크면 양성 클래스, 0.5보다 작으면 음성 클래스\n","\n"],"metadata":{"id":"EP3GLClZuer_"}},{"cell_type":"code","source":["# 로지스틱 회귀로 이진 분류 수행하기 : 불리언 인덱싱\n","char_arr = np.array(['A','B','C','D','E'])\n","print(char_arr[[True, False, True, False, False]])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ti63UEYBuaMP","executionInfo":{"status":"ok","timestamp":1728703057627,"user_tz":-540,"elapsed":350,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"2402a6af-d1ea-409e-b43f-b2d217d688d9"},"execution_count":13,"outputs":[{"output_type":"stream","name":"stdout","text":["['A' 'C']\n"]}]},{"cell_type":"code","source":["bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')\n","train_bream_smelt = train_scaled[bream_smelt_indexes]\n","target_bream_smelt = train_target[bream_smelt_indexes]"],"metadata":{"id":"poWeSlX2u0uG","executionInfo":{"status":"ok","timestamp":1728703162941,"user_tz":-540,"elapsed":365,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":14,"outputs":[]},{"cell_type":"code","source":["from sklearn.linear_model import LogisticRegression\n","\n","lr = LogisticRegression()\n","lr.fit(train_bream_smelt, target_bream_smelt)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":80},"id":"0HJVY91IvObf","executionInfo":{"status":"ok","timestamp":1728703194022,"user_tz":-540,"elapsed":351,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"6b413744-b949-425a-eb78-1e5cb9ae6842"},"execution_count":15,"outputs":[{"output_type":"execute_result","data":{"text/plain":["LogisticRegression()"],"text/html":["
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":15}]},{"cell_type":"code","source":["# 훈련 모델을 사용해 train_bream_smelt 에 있는 처음 5개 샘플 예측\n","print(lr.predict(train_bream_smelt[:5])) # 두번째 샘플 제외하고 모두 도미로 예측"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ayYednqtvWBd","executionInfo":{"status":"ok","timestamp":1728703212803,"user_tz":-540,"elapsed":367,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"2a9538fc-4d53-49b8-e102-5e50e3e0688c"},"execution_count":16,"outputs":[{"output_type":"stream","name":"stdout","text":["['Bream' 'Smelt' 'Bream' 'Bream' 'Bream']\n"]}]},{"cell_type":"code","source":["# 예측 확률은 predict_proba() 메서드에서 제공\n","print(lr.predict_proba(train_bream_smelt[:5]))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QEVH7kHjvamE","executionInfo":{"status":"ok","timestamp":1728703247136,"user_tz":-540,"elapsed":377,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"693a7d40-0973-4879-9534-2b9560b4a336"},"execution_count":17,"outputs":[{"output_type":"stream","name":"stdout","text":["[[0.99760007 0.00239993]\n"," [0.02737325 0.97262675]\n"," [0.99486386 0.00513614]\n"," [0.98585047 0.01414953]\n"," [0.99767419 0.00232581]]\n"]}]},{"cell_type":"code","source":["# 사이킷런은 타깃값을 알파벳순으로 정렬하여 사용\n","print(lr.classes_)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Tyu0kq0dvi-y","executionInfo":{"status":"ok","timestamp":1728703270005,"user_tz":-540,"elapsed":337,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"0665afc7-5176-40b8-a38c-2f0b2ecd1d47"},"execution_count":18,"outputs":[{"output_type":"stream","name":"stdout","text":["['Bream' 'Smelt']\n"]}]},{"cell_type":"code","source":["print(lr.coef_, 
lr.intercept_)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"5sU4Pu2ovokr","executionInfo":{"status":"ok","timestamp":1728703280273,"user_tz":-540,"elapsed":359,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"9b8503d1-2239-4c09-8fa6-f2d9b16e11b9"},"execution_count":19,"outputs":[{"output_type":"stream","name":"stdout","text":["[[-0.40451732 -0.57582787 -0.66248158 -1.01329614 -0.73123131]] [-2.16172774]\n"]}]},{"cell_type":"code","source":["# LogisticRegression 모델로 z 값을 계산할 수 있을까?\n","# LogisticRegression 클래스는 decision_function() 메서드로 z 값 출력 가능\n","decisions = lr.decision_function(train_bream_smelt[:5])\n","print(decisions)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"uMYw5X9kvrEr","executionInfo":{"status":"ok","timestamp":1728703333064,"user_tz":-540,"elapsed":366,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"8dca1331-c965-4874-b9fb-3747a37bb96e"},"execution_count":20,"outputs":[{"output_type":"stream","name":"stdout","text":["[-6.02991358 3.57043428 -5.26630496 -4.24382314 -6.06135688]\n"]}]},{"cell_type":"code","source":["# z 값을 시그모이드 함수에 통과, 확률 얻을 수 있음. 
expit() 사용.\n","from scipy.special import expit\n","print(expit(decisions))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"70ltNxgUv39W","executionInfo":{"status":"ok","timestamp":1728703380595,"user_tz":-540,"elapsed":366,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"325e0670-3d68-4b01-82a6-8ba1edd0f2c0"},"execution_count":21,"outputs":[{"output_type":"stream","name":"stdout","text":["[0.00239993 0.97262675 0.00513614 0.01414953 0.00232581]\n"]}]},{"cell_type":"markdown","source":["# 로지스틱 회귀로 다중 분류"],"metadata":{"id":"9JfK9AT-wG8d"}},{"cell_type":"code","source":["# LogisticRegression 클래스 : 반복적인 알고리즘 사용, 기본값은 100.\n","# 릿지 회귀와 같이 계수의 제곱을 규제.(L2 규제)\n","# 릿지 회귀 : alpha 매개변수로 규제 양 조절 / LogisticRegression 클래스 : C는 alpha 와 반대로 작을수록 규제가 커짐.\n","lr = LogisticRegression(C=20, max_iter=1000)\n","lr.fit(train_scaled, train_target)\n","\n","print(lr.score(train_scaled, train_target))\n","print(lr.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"KxbgkEPuwDkF","executionInfo":{"status":"ok","timestamp":1728703493745,"user_tz":-540,"elapsed":346,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"8d38cb2d-7d71-415e-8ccd-0a3b79bedea6"},"execution_count":22,"outputs":[{"output_type":"stream","name":"stdout","text":["0.9327731092436975\n","0.925\n"]}]},{"cell_type":"code","source":["print(lr.predict(test_scaled[:5]))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"r095vMgPwfL8","executionInfo":{"status":"ok","timestamp":1728703499390,"user_tz":-540,"elapsed":349,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"b34798af-047e-4a7e-db90-4c78ebf141b1"},"execution_count":23,"outputs":[{"output_type":"stream","name":"stdout","text":["['Perch' 'Smelt' 'Pike' 'Roach' 'Perch']\n"]}]},{"cell_type":"code","source":["proba = lr.predict_proba(test_scaled[:5])\n","print(np.round(proba, 
decimals=3))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hZeRndCWwgkm","executionInfo":{"status":"ok","timestamp":1728703507145,"user_tz":-540,"elapsed":336,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"c1246a6c-7c59-4575-8c88-b026226e403d"},"execution_count":24,"outputs":[{"output_type":"stream","name":"stdout","text":["[[0. 0.014 0.842 0. 0.135 0.007 0.003]\n"," [0. 0.003 0.044 0. 0.007 0.946 0. ]\n"," [0. 0. 0.034 0.934 0.015 0.016 0. ]\n"," [0.011 0.034 0.305 0.006 0.567 0. 0.076]\n"," [0. 0. 0.904 0.002 0.089 0.002 0.001]]\n"]}]},{"cell_type":"code","source":["print(lr.classes_)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3X5JvetdwieC","executionInfo":{"status":"ok","timestamp":1728703518118,"user_tz":-540,"elapsed":374,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"bdcb183d-1ab9-42db-d04f-66df9ad0eff5"},"execution_count":25,"outputs":[{"output_type":"stream","name":"stdout","text":["['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']\n"]}]},{"cell_type":"code","source":["print(lr.coef_.shape, lr.intercept_.shape) # 다중 분류는 클래스마다 z 값을 하나씩 계산. 
다중분류는 '소프트맥스(지수함수)함수' 사용."],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yUnBgGEPwlI-","executionInfo":{"status":"ok","timestamp":1728703527751,"user_tz":-540,"elapsed":353,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"e4524c6a-0bb5-4714-ee7e-1c15187a7915"},"execution_count":26,"outputs":[{"output_type":"stream","name":"stdout","text":["(7, 5) (7,)\n"]}]},{"cell_type":"code","source":["decision = lr.decision_function(test_scaled[:5])\n","print(np.round(decision, decimals=2))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"V6CG-egHwnf7","executionInfo":{"status":"ok","timestamp":1728703616211,"user_tz":-540,"elapsed":369,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"c67ed2a8-8473-4bf7-842d-02146ea6cc4a"},"execution_count":27,"outputs":[{"output_type":"stream","name":"stdout","text":["[[ -6.51 1.04 5.17 -2.76 3.34 0.35 -0.63]\n"," [-10.88 1.94 4.78 -2.42 2.99 7.84 -4.25]\n"," [ -4.34 -6.24 3.17 6.48 2.36 2.43 -3.87]\n"," [ -0.69 0.45 2.64 -1.21 3.26 -5.7 1.26]\n"," [ -6.4 -1.99 5.82 -0.13 3.5 -0.09 -0.7 ]]\n"]}]},{"cell_type":"code","source":["from scipy.special import softmax\n","\n","proba = softmax(decision, axis=1)\n","print(np.round(proba, decimals=3))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qaqT69Fnw9FY","executionInfo":{"status":"ok","timestamp":1728703623216,"user_tz":-540,"elapsed":358,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"80fe8443-3dca-455f-a37b-33145cbb2cea"},"execution_count":28,"outputs":[{"output_type":"stream","name":"stdout","text":["[[0. 0.014 0.842 0. 0.135 0.007 0.003]\n"," [0. 0.003 0.044 0. 0.007 0.946 0. ]\n"," [0. 0. 0.034 0.934 0.015 0.016 0. ]\n"," [0.011 0.034 0.305 0.006 0.567 0. 0.076]\n"," [0. 0. 
0.904 0.002 0.089 0.002 0.001]]\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"72KulIJcw-zg"},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git "a/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3-2.ipynb" "b/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3-2.ipynb" new file mode 100644 index 0000000..76f5069 --- /dev/null +++ "b/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3-2.ipynb" @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPXPrs6PEHOmUsQMSXiu0IJ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# 확률적 경사 하강법"],"metadata":{"id":"Yc7U3Xu7xPqE"}},{"cell_type":"markdown","source":["점진적 학습 알고리즘 : 확률적 경사 하강법"],"metadata":{"id":"aM5c35DgxYa7"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"bhaTxmXzxNHx"},"outputs":[],"source":["# 확률적 경사 하강법에서 훈련 세트를 한 번 모두 사용하는 과정 : 에포크 라고 함.\n","# 여러개의 샘플을 사용해 경사하강법 수행하는 방식 : 미니배치 경사하강법 이라고 함.\n","# 한 번 경사로를 따라 이동하기 위해 전체 샘플 사용 : 배치 경사하강법 이라고 함."]},{"cell_type":"markdown","source":["가장 빠른 길을 찾는 과정 : 손실함수\n","손실 함수 : 머신러닝이 얼마나 엉터리인지 측정하는 기준. 즉, 손실함수 값이 작을수록 좋음"],"metadata":{"id":"eUhaI0hrxw1X"}},{"cell_type":"markdown","source":["# 로지스틱 손실 함수\n","\n","\n","\n","1. 양성 클래스 (타킷=1) 일 때 손실은 -log(예측 확률) 로 계산 :\n","확률은 1에서 멀어져 0에 가까워질수록 손실은 아주 큰 양수가 됨.\n","\n","2. 
음성 클래스 (타깃=0) 일 때 손실은 -log(1-예측 확률)로 계산 :\n","확률이 0에서 멀어져 1에 가까워질수록 손실은 아주 큰 양수가 됨\n","\n","\n"],"metadata":{"id":"p1lmIQN0x-Ub"}},{"cell_type":"markdown","source":["이진 분류 : 로지스틱 손실 함수 사용\n","다중 분류 : 크로스엔트로피 손실 함수 사용"],"metadata":{"id":"qqBMM8lnyfZ2"}},{"cell_type":"markdown","source":["# SGDClassifier"],"metadata":{"id":"Ig_iO--JymYc"}},{"cell_type":"code","source":["import pandas as pd\n","fish = pd.read_csv('https://bit.ly/fish_csv_data')"],"metadata":{"id":"G3XclUNhybPu","executionInfo":{"status":"ok","timestamp":1728704095807,"user_tz":-540,"elapsed":1707,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":1,"outputs":[]},{"cell_type":"code","source":["fish_input = fish[['Weight','Length','Diagonal','Height','Width']].to_numpy()\n","fish_target = fish['Species'].to_numpy()"],"metadata":{"id":"k499vL20ywDl","executionInfo":{"status":"ok","timestamp":1728704163731,"user_tz":-540,"elapsed":354,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["# 사이킷런 train_test_split() 함수 사용해 훈련세트, 테스트 세트 나눔\n","from sklearn.model_selection import train_test_split\n","\n","train_input, test_input, train_target, test_target = train_test_split(\n"," fish_input, fish_target, random_state=42)"],"metadata":{"id":"cgLPmX_5y7oX","executionInfo":{"status":"ok","timestamp":1728704196506,"user_tz":-540,"elapsed":2421,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["# 훈련 세트에서 학습한 통계 값으로 테스트 세트도 변환\n","from sklearn.preprocessing import StandardScaler\n","\n","ss = StandardScaler()\n","ss.fit(train_input)\n","train_scaled = ss.transform(train_input)\n","test_scaled = 
ss.transform(test_input)"],"metadata":{"id":"hH3gNxEPzKQM","executionInfo":{"status":"ok","timestamp":1728704207114,"user_tz":-540,"elapsed":355,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["# 사이킷런에서 확률적 경사하강법 제공하는 분류용 클래스 : SGDClassifier\n","from sklearn.linear_model import SGDClassifier\n","\n","sc = SGDClassifier(loss='log_loss', max_iter=10, random_state=42) # 전체 훈련 세트 10회 반복\n","sc.fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vzRaU9iSzNWt","executionInfo":{"status":"ok","timestamp":1728704245146,"user_tz":-540,"elapsed":385,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"ac6c9816-70b8-4ac4-d4eb-c45991b3463e"},"execution_count":6,"outputs":[{"output_type":"stream","name":"stdout","text":["0.773109243697479\n","0.775\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_stochastic_gradient.py:744: ConvergenceWarning: Maximum number of iteration reached before convergence. 
Consider increasing max_iter to improve the fit.\n"," warnings.warn(\n"]}]},{"cell_type":"code","source":["# 모델을 이어서 훈련할 때에는 partial_fit() 메서드 사용 : fit() 메서드와 사용법 같지만, 호출할 때마다 1 에포크씩 이어서 훈련 가능\n","sc.partial_fit(train_scaled, train_target)\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ipb-aLorzWn_","executionInfo":{"status":"ok","timestamp":1728704319761,"user_tz":-540,"elapsed":363,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"fd52f08e-6486-4fba-cf0b-ff20393a3706"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["0.8151260504201681\n","0.85\n"]}]},{"cell_type":"code","source":["sc.partial_fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Y2UcXKHfzoza","executionInfo":{"status":"ok","timestamp":1728704336196,"user_tz":-540,"elapsed":349,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"9f3df98c-e47b-4159-9b1d-36b81c1ac9ac"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["0.7815126050420168\n","0.8\n"]}]},{"cell_type":"markdown","source":["# 에포크와 과대/과소 적합"],"metadata":{"id":"SCcnZfy9z08F"}},{"cell_type":"code","source":["# 과대 적합이 시작하기 전에 훈련을 멈추는 것 : 조기 종료\n","# fit() 메서드 대신 partial_fit() 메서드 사용\n","\n","import numpy as np\n","sc = SGDClassifier(loss='log_loss', random_state=42)\n","train_score = []\n","test_score = []\n","classes = np.unique(train_target)"],"metadata":{"id":"w2Kufp_xzs3w","executionInfo":{"status":"ok","timestamp":1728704450887,"user_tz":-540,"elapsed":381,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["# for 문 사용하여 300번 에포크 동안 훈련 반복\n","for _ in range(0, 300):\n"," sc.partial_fit(train_scaled, 
train_target, classes=classes)\n","\n"," train_score.append(sc.score(train_scaled, train_target))\n"," test_score.append(sc.score(test_scaled, test_target))"],"metadata":{"id":"vEe-6cv70I3G","executionInfo":{"status":"ok","timestamp":1728704479204,"user_tz":-540,"elapsed":3255,"user":{"displayName":"조예인","userId":"17650117334011908449"}}},"execution_count":10,"outputs":[]},{"cell_type":"code","source":["import matplotlib.pyplot as plt\n","\n","plt.plot(train_score)\n","plt.plot(test_score)\n","plt.xlabel('epoch')\n","plt.ylabel('accuracy')\n","plt.show() # 100 번째 이후에는 점수가 벌어지는 경향."],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"eVjbv9yp0PEt","executionInfo":{"status":"ok","timestamp":1728704483891,"user_tz":-540,"elapsed":800,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"0c97edeb-f494-4b76-d71e-9b9ef8c15ab9"},"execution_count":11,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["sc = SGDClassifier(loss='log_loss', max_iter=100, tol=None, random_state=42)\n","sc.fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QnYaijuL0Q0Q","executionInfo":{"status":"ok","timestamp":1728704515356,"user_tz":-540,"elapsed":365,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"11ae6dce-e1b5-42ef-d313-822b1dd306b5"},"execution_count":12,"outputs":[{"output_type":"stream","name":"stdout","text":["0.957983193277311\n","0.925\n"]}]},{"cell_type":"code","source":["# 힌지 손실\n","sc = SGDClassifier(loss='hinge', max_iter=100, tol=None, random_state=42)\n","sc.fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"OUDu-8XE0Ymu","executionInfo":{"status":"ok","timestamp":1728704520425,"user_tz":-540,"elapsed":342,"user":{"displayName":"조예인","userId":"17650117334011908449"}},"outputId":"a7a8de04-6360-4e20-85f4-3b3cbf0c738e"},"execution_count":13,"outputs":[{"output_type":"stream","name":"stdout","text":["0.9495798319327731\n","0.925\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"nBF9hbKg0Z2W"},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git "a/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3.pdf" "b/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3.pdf" new file mode 100644 index 0000000..2ed3db8 Binary files /dev/null and "b/week3/[MLNovice]\354\241\260\354\230\210\354\235\270_week3.pdf" differ