diff --git a/docs/assignment-2/index.html b/docs/assignment-2/index.html index 3842ef4..ea5ec84 100644 --- a/docs/assignment-2/index.html +++ b/docs/assignment-2/index.html @@ -9,16 +9,15 @@ a.url { word-break: break-all; } a:active, a:hover { outline: 0px; } .in-text-selection, ::selection { text-shadow: none; background: var(--select-text-bg-color); color: var(--select-text-font-color); } -#write { margin: 0px auto; height: auto; width: inherit; word-break: normal; word-wrap: break-word; position: relative; white-space: normal; overflow-x: visible; padding-top: 40px; } +#write { margin: 0px auto; height: auto; width: inherit; word-break: normal; word-wrap: break-word; position: relative; white-space: normal; overflow-x: visible; } #write.first-line-indent p { text-indent: 2em; } #write.first-line-indent li p, #write.first-line-indent p * { text-indent: 0px; } #write.first-line-indent li { margin-left: 2em; } .for-image #write { padding-left: 8px; padding-right: 8px; } body.typora-export { padding-left: 30px; padding-right: 30px; } -.typora-export .footnote-line, .typora-export li, .typora-export p { white-space: pre-wrap; } +.typora-export .footnote-line, .typora-export p { white-space: pre-wrap; } @media screen and (max-width: 500px) { body.typora-export { padding-left: 0px; padding-right: 0px; } - #write { padding-left: 20px; padding-right: 20px; } .CodeMirror-sizer { margin-left: 0px !important; } .CodeMirror-gutters { display: none !important; } } @@ -50,9 +49,8 @@ tr { break-inside: avoid; break-after: auto; } thead { display: table-header-group; } table { border-collapse: collapse; border-spacing: 0px; width: 100%; overflow: auto; break-inside: auto; text-align: left; } -table.md-table td { min-width: 32px; } +table.md-table td { min-width: 80px; } .CodeMirror-gutters { border-right: 0px; background-color: inherit; } -.CodeMirror-linenumber { user-select: none; } .CodeMirror { text-align: left; } .CodeMirror-placeholder { opacity: 0.3; } .CodeMirror pre { padding: 0px 4px; } @@ -82,25 +80,25 @@ #write .footnote-line { white-space: pre-wrap; } @media print { body, html { border: 1px solid transparent; height: 99%; break-after: avoid; break-before: avoid; } - #write { margin-top: 0px; padding-top: 0px; border-color: transparent !important; } + #write { margin-top: 0px; border-color: transparent !important; } .typora-export * { -webkit-print-color-adjust: exact; } html.blink-to-pdf { font-size: 13px; } - .typora-export #write { padding-left: 32px; padding-right: 32px; padding-bottom: 0px; break-after: avoid; } + .typora-export #write { padding-left: 1cm; padding-right: 1cm; padding-bottom: 0px; break-after: avoid; } .typora-export #write::after { height: 0px; } @page { margin: 20mm 0px; } } .footnote-line { margin-top: 0.714em; font-size: 0.7em; } a img, img a { cursor: pointer; } pre.md-meta-block { font-size: 0.8rem; min-height: 0.8rem; white-space: pre-wrap; background: rgb(204, 204, 204); display: block; overflow-x: hidden; } -p > .md-image:only-child:not(.md-img-error) img, p > img:only-child { display: block; margin: auto; } -p > .md-image:only-child { display: inline-block; width: 100%; } +p > img:only-child { display: block; margin: auto; } +p > .md-image:only-child { display: inline-block; width: 100%; text-align: center; } #write .MathJax_Display { margin: 0.8em 0px 0px; } .md-math-block { width: 100%; } .md-math-block:not(:empty)::after { display: none; } [contenteditable="true"]:active, [contenteditable="true"]:focus { outline: 0px; box-shadow: none; } .md-task-list-item { 
position: relative; list-style-type: none; } .task-list-item.md-task-list-item { padding-left: 0px; } -.md-task-list-item > input { position: absolute; top: 0px; left: 0px; margin-left: -1.2em; margin-top: calc(1em - 10px); border: none; } +.md-task-list-item > input { position: absolute; top: 0px; left: 0px; margin-left: -1.2em; margin-top: calc(1em - 10px); } .math { font-size: 1rem; } .md-toc { min-height: 3.58rem; position: relative; font-size: 0.9rem; border-radius: 10px; } .md-toc-content { position: relative; margin-left: 0px; } @@ -131,7 +129,6 @@ code { text-align: left; vertical-align: initial; } a.md-print-anchor { white-space: pre !important; border-width: initial !important; border-style: none !important; border-color: initial !important; display: inline-block !important; position: absolute !important; width: 1px !important; right: 0px !important; outline: 0px !important; background: 0px 0px !important; text-decoration: initial !important; text-shadow: initial !important; } .md-inline-math .MathJax_SVG .noError { display: none !important; } -.html-for-mac .inline-math-svg .MathJax_SVG { vertical-align: 0.2px; } .md-math-block .MathJax_SVG_Display { text-align: center; margin: 0px; position: relative; text-indent: 0px; max-width: none; max-height: none; min-height: 0px; min-width: 100%; width: auto; overflow-y: hidden; display: block !important; } .MathJax_SVG_Display, .md-inline-math .MathJax_SVG_Display { width: auto; margin: inherit; display: inline-block !important; } .MathJax_SVG .MJX-monospace { font-family: var(--monospace); } @@ -222,7 +219,12 @@ } -:root { --active-file-bg-color: rgba(32, 43, 51, 0.63); --active-file-text-color: white; --bg-color: #f3f2ee; --text-color: #1f0909; --control-text-color: #444; --rawblock-edit-panel-bd: #e5e5e5; --select-text-bg-color: rgba(32, 43, 51, 0.63); --select-text-font-color: white; } +:root { --rawblock-edit-panel-bd: #e5e5e5; } +@font-face { font-family: "PT Serif"; font-style: normal; font-weight: normal; src: local("PT Serif"), local("PTSerif-Regular"), url("./newsprint/pt-serif-v9-latin-regular.woff") format("woff"); } +@font-face { font-family: "PT Serif"; font-style: italic; font-weight: normal; src: local("PT Serif"), local("PTSerif-Italic"), url("./newsprint/pt-serif-v9-latin-italic.woff") format("woff"); } +@font-face { font-family: "PT Serif"; font-style: normal; font-weight: bold; src: local("PT Serif"), local("PTSerif-Bold"), url("./newsprint/pt-serif-v9-latin-700.woff") format("woff"); } +@font-face { font-family: "PT Serif"; font-style: italic; font-weight: bold; src: local("PT Serif"), local("PTSerif-BoldItalic"), url("./newsprint/pt-serif-v9-latin-700italic.woff") format("woff"); } +:root { --active-file-bg-color: rgba(32, 43, 51, 0.63); --active-file-text-color: white; --bg-color: #f3f2ee; --text-color: #1f0909; --select-text-bg-color: rgba(32, 43, 51, 0.63); --select-text-font-color: white; } pre { --select-text-bg-color: #36284e; --select-text-font-color: #fff; } html { font-size: 16px; } html, body { background-color: rgb(243, 242, 238); font-family: "PT Serif", "Times New Roman", Times, serif; color: rgb(31, 9, 9); line-height: 1.5em; } @@ -251,21 +253,20 @@ thead th, tfoot th { padding: 0.25em 0.25em 0.25em 0.4em; text-transform: uppercase; } th { text-align: left; } td { vertical-align: top; padding: 0.25em 0.25em 0.25em 0.4em; } -code, .md-fences { background-color: rgb(218, 218, 218); } -code { padding-left: 2px; padding-right: 2px; } -.md-fences { margin-left: 2em; margin-bottom: 3em; padding-left: 1ch; 
padding-right: 1ch; } +code, .md-fences { background-color: rgb(218, 218, 218); padding-left: 1ch; padding-right: 1ch; } +.md-fences { margin-left: 2em; margin-bottom: 3em; } pre, code, tt { font-size: 0.875em; line-height: 1.71429em; } h1 { line-height: 1.3em; font-weight: normal; margin-bottom: 0.5em; } -p + ul, p + ol { margin-top: 0.5em; } +p + ul, p + ol { margin-top: -1em; } h3 + ul, h4 + ul, h5 + ul, h6 + ul, h3 + ol, h4 + ol, h5 + ol, h6 + ol { margin-top: 0.5em; } li > ul, li > ol { margin-top: inherit; margin-bottom: 0px; } -li ol > li { list-style-type: lower-alpha; } -li li ol > li { list-style-type: lower-roman; } +li li { list-style-type: lower-alpha; } +li li li { list-style-type: lower-roman; } +li > blockquote { margin-bottom: 0px; } h2, h3 { margin-bottom: 0.75em; } hr { border-top: none; border-right: none; border-bottom: 1px solid; border-left: none; } h1 { border-color: rgb(197, 197, 197); } blockquote { border-color: rgb(186, 186, 186); color: rgb(101, 101, 101); } -blockquote ul, blockquote ol { margin-left: 0px; } .ty-table-edit { background-color: transparent; } thead { background-color: rgb(218, 218, 218); } tr:nth-child(2n) { background: rgb(232, 231, 231); } @@ -322,17 +323,15 @@ input { border: 1px solid rgb(170, 170, 170); } .megamenu-menu-header #megamenu-menu-header-title, .megamenu-menu-header:hover, .megamenu-menu-header:focus { color: inherit; } .dropdown-menu .divider { border-color: rgb(229, 229, 229); } -.os-windows-7 strong, .os-windows-7 strong { font-weight: 760; } - .typora-export li, .typora-export p, .typora-export, .footnote-line {white-space: normal;} + .typora-export p, .typora-export .footnote-line {white-space: normal;} - -

+ +

Pattern Recognition and Machine Learning

Fudan University / 2019 Spring

Assignment 2

In this assignment you are going to explore several well-known linear classification methods, such as the perceptron and logistic regression; a realistic dataset is provided for you to evaluate these methods.

Description

Part 1

To start with, we consider the least squares model, an extension of the linear regression model to the problem of classification, as well as the perceptron algorithm. You are given a simple linearly separable dataset containing two classes of points on a 2D plane, and you should build a model that classifies them correctly.

You are provided with a function gen_linear_seperatable_2d_2c_dataset in the handout file; import it into your solution as in the first assignment.

Figure: the provided linearly separable two-class dataset

Requirements

Specifically, you should use the least squares model and the perceptron algorithm to learn two models that separate the dataset, and report the accuracy of each model after training.

Since the two models are linear, you should be able to draw a decision line on top of the dataset to visually show how you separate it. Include this plot in your report.
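A minimal matplotlib sketch of such a plot, assuming you have extracted the 2D points X (shape N×2), the labels y, and a learned weight vector w with bias b; all of these names are placeholders, since the handout's dataset object may expose its contents differently:

```python
import numpy as np
import matplotlib.pyplot as plt

def plot_decision_line(X, y, w, b):
    """Scatter the two classes and overlay the line w[0]*x1 + w[1]*x2 + b = 0.

    X, y, w, b are placeholder names for however you store the data and model;
    assumes w[1] != 0 so the line can be written as x2 = -(w[0]*x1 + b) / w[1].
    """
    plt.scatter(X[y == 0, 0], X[y == 0, 1], marker="o", label="class 0")
    plt.scatter(X[y == 1, 0], X[y == 1, 1], marker="x", label="class 1")
    xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
    plt.plot(xs, -(w[0] * xs + b) / w[1], "k--", label="decision line")
    plt.legend()
    plt.show()
```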

Part 2

In this part of the assignment, you are required to use logistic regression for a simple text classification task. A function get_text_classification_datasets is also provided, which returns the training and testing datasets; you can explore them by looking closely at what it returns. Note that calling the function for the first time will take several minutes to cache the dataset to your disk (the path is ../../.., which is parallel to the root of the project directory).

Text classification is the task of labeling a given document with its genre, e.g. sports, news, health, etc. To represent a document in a form that is convenient for logistic regression, we can encode it as a multi-hot vector.

First you have to tokenize the document into words (a list of strings): ignore all the characters in string.punctuation, turn every string.whitespace character into a space, and then split the document on spaces to obtain a list of strings. Subsequently, you should convert all characters to lowercase to simplify later processing. Then you build a vocabulary over all the words in the training dataset, which simply maps each word in the training dataset to a number. For example,
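The handout's actual docs_toy strings are not reproduced in this diff, so the following blocks use two assumed toy documents purely for illustration; the preprocessing itself follows the steps described above.

```python
import string

# Hypothetical toy documents standing in for the handout's docs_toy.
docs_toy = [
    "The quick brown fox.",
    "The\tlazy dog!",
]

def tokenize(doc):
    """Drop punctuation, map every whitespace character to a space,
    lowercase, and split on spaces."""
    doc = doc.translate(str.maketrans("", "", string.punctuation))
    doc = doc.translate(str.maketrans(string.whitespace, " " * len(string.whitespace)))
    return doc.lower().split()

tokenized = [tokenize(doc) for doc in docs_toy]
```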

When tokenized, you will have
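With the assumed toy documents above, the tokenized result would be:

```python
[['the', 'quick', 'brown', 'fox'],
 ['the', 'lazy', 'dog']]
```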

and your vocabulary will look like
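With the same toy documents, and assigning indices in alphabetical order (any consistent assignment works):

```python
{'brown': 0, 'dog': 1, 'fox': 2, 'lazy': 3, 'quick': 4, 'the': 5}
```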

and you use the vocabulary to map the tokenized document to a multi-hot vector!
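Using that vocabulary, the two toy documents become the rows:

```python
[[1, 0, 1, 0, 1, 1],   # 'the quick brown fox' -> brown, fox, quick, the
 [0, 1, 0, 1, 0, 1]]   # 'the lazy dog'        -> dog, lazy, the
```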

as you can verify, this is the representation of the two documents above.

In practice the vocabulary is quite large, which may cause the multi-hot vectors to exceed memory limits! To address this problem, you can set a frequency threshold min_count and only keep the words that occur at least min_count times in the whole training set. For this problem, min_count = 10 is suitable.
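A minimal sketch of this frequency filtering, assuming tokenized_train is the list of tokenized training documents (the name is illustrative, not from the handout):

```python
from collections import Counter

min_count = 10
# tokenized_train: placeholder for your list of tokenized training documents.
counts = Counter(word for doc in tokenized_train for word in doc)
vocab = {word: idx
         for idx, word in enumerate(sorted(w for w, c in counts.items() if c >= min_count))}
```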

Once you can represent the documents as vectors (and the categories of the documents as one-hot vectors), you can use logistic regression!

Logistic regression is a kind of generalized linear model. The major (or only?) difference between logistic regression and least squares is that in logistic regression we apply a non-linear function after the linear transformation, which gives the output a probabilistic interpretation. For binary classification, the logistic sigmoid function

$$\sigma(a) = \frac{1}{1 + \exp(-a)}$$

transforms the unbounded prediction $a$ of the linear model into the bounded interval $(0, 1)$, which can be interpreted as the probability of the prediction.

Note that the logistic sigmoid function is just a special case of the following softmax function. Given a vector $\mathbf{a}$ of dimension $K$, the softmax function computes the vector $\operatorname{softmax}(\mathbf{a})$ with the following components:

$$\operatorname{softmax}(\mathbf{a})_k = \frac{\exp(a_k)}{\sum_{j=1}^{K} \exp(a_j)}$$

In other words, the softmax function first exponentiates the vector (elementwise) and then normalizes it so that all the components add up to 1. The resulting vector can be interpreted as a probability distribution over the classes. We can then make a prediction by selecting the class with the maximum associated probability:

$$\hat{y} = \arg\max_{k} \operatorname{softmax}(\mathbf{a})_k$$

One interesting property of the softmax function is that it is invariant to constant offsets, i.e. $\operatorname{softmax}(\mathbf{a} + c\mathbf{1}) = \operatorname{softmax}(\mathbf{a})$, where $c\mathbf{1}$ is a broadcasted vector of equal constant values. This means you could (and perhaps should) subtract $\max_k a_k$ from $\mathbf{a}$ before exponentiating, to stabilize the numerical calculation.
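A minimal numpy sketch of this max-subtraction trick (an illustration, not part of the handout):

```python
import numpy as np

def softmax(a):
    """Numerically stable softmax along the last axis."""
    shifted = a - np.max(a, axis=-1, keepdims=True)  # offset invariance
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)

print(softmax(np.array([1000.0, 1001.0, 1002.0])))  # no overflow thanks to the shift
```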

To sum up, for logistic regression the predicted probability distribution is computed as

$$\hat{\mathbf{y}} = \operatorname{softmax}(W\mathbf{x} + \mathbf{b}),$$

where $\mathbf{x}$ is the multi-hot input vector, $W$ the weight matrix, and $\mathbf{b}$ the bias vector.

Because of the softmax function, we cannot write down a closed-form solution for the optimization, so we use gradient descent to optimize a loss function; as is typically done in machine learning, we minimize a loss defined on top of the predictions of the linear model. Concretely, we use the cross-entropy loss defined as

$$L(W, \mathbf{b}) = -\frac{1}{N}\sum_{n=1}^{N} \log \hat{y}_{n, c_n},$$

where $\hat{y}_{n, c_n}$ is the predicted probability of the correct class $c_n$ for the $n$-th training example and $N$ is the total number of training examples. In order to reduce overfitting, an additional term penalizing large weights is added to the loss function:

$$\tilde{L}(W, \mathbf{b}) = L(W, \mathbf{b}) + \lambda \lVert W \rVert_2^2$$

Stochastic gradient descent starts by taking the gradient of the loss with respect to each parameter and then updates the parameter using that gradient, namely

$$W \leftarrow W - \alpha \frac{\partial \tilde{L}}{\partial W}, \qquad \mathbf{b} \leftarrow \mathbf{b} - \alpha \frac{\partial \tilde{L}}{\partial \mathbf{b}},$$

where $\alpha$ is a hyperparameter called the learning rate, which adjusts the magnitude of the weight updates.
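Below is a minimal sketch of one full-batch gradient-descent step under this setup, assuming X is the N×D multi-hot matrix and T the N×K one-hot target matrix; the names (train_step, X, T, alpha, lam) are illustrative, this is not the handout's reference implementation, and you are still expected to derive the gradient yourself in requirement 2.

```python
import numpy as np

def train_step(W, b, X, T, alpha=0.1, lam=0.0):
    """One full-batch gradient-descent step for softmax regression.

    W: (D, K) weights, b: (K,) bias, X: (N, D) multi-hot inputs,
    T: (N, K) one-hot targets; all names are placeholders.
    Returns the updated (W, b) and the regularized loss.
    """
    N = X.shape[0]
    logits = X @ W + b                              # (N, K)
    logits -= logits.max(axis=1, keepdims=True)     # numerical stability
    probs = np.exp(logits)
    probs /= probs.sum(axis=1, keepdims=True)       # softmax, (N, K)

    loss = -np.mean(np.log(probs[T.astype(bool)])) + lam * np.sum(W ** 2)

    grad_logits = (probs - T) / N                   # gradient w.r.t. the logits
    grad_W = X.T @ grad_logits + 2 * lam * W        # (D, K)
    grad_b = grad_logits.sum(axis=0)                # (K,)

    return W - alpha * grad_W, b - alpha * grad_b, loss
```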

Since the loss function of logistic regression is convex (as you can verify by differentiating the loss function twice), you are guaranteed to reach the global minimum of the loss function with gradient descent.
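For intuition, here is the standard convexity check for the binary (sigmoid) case; the handout does not spell this out, so take it as a supplementary sketch rather than the required derivation. With $\hat{y}_n = \sigma(\mathbf{w}^\top \mathbf{x}_n)$,

$$L(\mathbf{w}) = -\sum_{n} \left[ t_n \log \hat{y}_n + (1 - t_n) \log (1 - \hat{y}_n) \right],$$

$$\nabla_{\mathbf{w}} L = \sum_{n} (\hat{y}_n - t_n)\,\mathbf{x}_n, \qquad \nabla^2_{\mathbf{w}} L = \sum_{n} \hat{y}_n (1 - \hat{y}_n)\,\mathbf{x}_n \mathbf{x}_n^\top \succeq 0,$$

and a positive semidefinite Hessian means the loss is convex.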

To conclude, the requirements are:

  1. Implement the preprocessing pipeline to transform the documents into multi-hot vector representations and the targets into one-hot representations, and indicate how you implemented them in your report.
  2. Differentiate the loss function for logistic regression, write down how you compute $\partial \tilde{L} / \partial W$, and then implement the calculation in vectorized numpy style (meaning you should not use any explicit loop to obtain the gradient). Answer two questions in your report: (1) sometimes, to overcome overfitting, people also use L2 regularization for logistic regression; if you add L2 regularization, should you regularize the bias term? (2) How do you check that your gradient calculation is correct? (A finite-difference checking sketch is given after this list.)
  3. Finish training the logistic regression model on the training dataset, and include the plot of the loss curve you obtained during training. Answer the questions: (1) how do you determine the learning rate? (2) How do you determine when to terminate the training procedure?
  4. Sometimes, instead of full-batch gradient descent (where all the training data are taken into account when computing the gradient), people use stochastic gradient descent or batched gradient descent (meaning only one sample or several samples per update); you should also experiment with these two other ways of doing gradient descent with logistic regression. Answer the questions: (1) what do you observe with the two other kinds of gradient descent? (2) Can you tell what the pros and cons of each of the three gradient update strategies are?
  5. Report your results for the three differently trained models on the test dataset.

    Do not peek into the test dataset before this requirement!
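As referenced in requirement 2, one common way to check a gradient implementation is a central finite-difference comparison. The sketch below assumes a loss_fn(W) returning the scalar loss and a grad_fn(W) returning the analytic gradient; both names are placeholders, not from the handout.

```python
import numpy as np

def check_gradient(loss_fn, grad_fn, W, eps=1e-5, n_checks=20, seed=0):
    """Compare the analytic gradient with central finite differences
    at a few randomly chosen entries of W.

    loss_fn and grad_fn are placeholder callables for your own loss
    and gradient implementations.
    """
    rng = np.random.default_rng(seed)
    analytic = grad_fn(W)
    for _ in range(n_checks):
        idx = tuple(rng.integers(0, s) for s in W.shape)
        W_plus, W_minus = W.copy(), W.copy()
        W_plus[idx] += eps
        W_minus[idx] -= eps
        numeric = (loss_fn(W_plus) - loss_fn(W_minus)) / (2 * eps)
        rel_err = abs(numeric - analytic[idx]) / max(1e-12, abs(numeric) + abs(analytic[idx]))
        assert rel_err < 1e-5, f"gradient mismatch at {idx}: relative error {rel_err:.2e}"
```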

\ No newline at end of file diff --git a/docs/assignment-2/index.md b/docs/assignment-2/index.md index 276d731..e017c2c 100644 --- a/docs/assignment-2/index.md +++ b/docs/assignment-2/index.md @@ -28,7 +28,7 @@ In this part of the assignment, you are required to use logistic regression to d Text classification is about to classify a given document with its corresponding genre, e.g. sports, news, health etc. To represent the document in a convenient way to be processed by the logistic regression, we could represent the document in a multi-hot style vector. -Firstly you will have to tokenize the document into words (or a list of strings), you should firstly ignore all the characters in `string.punctuation` and then make all `string.whitespace` characters a space , and then split the document by all the spaces to have a list of strings. Then you build a vocabulary on all the words in the training dataset, which simply maps each word in the training dataset to a number. For example, +Firstly you will have to tokenize the document into words (or a list of strings), you should firstly ignore all the characters in `string.punctuation` and then make all `string.whitespace` characters a space , and then split the document by all the spaces to have a list of strings. Subsequently, you should convert all characters into lowercase to facilitate future process. Then you build a vocabulary on all the words in the training dataset, which simply maps each word in the training dataset to a number. For example, ```python docs_toy = [ @@ -66,6 +66,8 @@ and you use the vocabulary to map the tokenized document to a multi-hot vector! as you could verify this is the representation from the above two document. +In practice, the vocabulary dictionary is quite large, which may cause the size of multi-hot vector exceeds the memory limits! To address this problem, you can set a frequency threshold `min_count` and only consider the words which occur at least `min_count` times in the overall training set. For this problem, `min_count = 10` is suitable. + Once you could represent the document in vectors (and also the category of the document in one-hot representation), then you can use the logistic regression! Logistic regression is a kind of generalized linear model, the major (or only?) difference between logistic regression and least square is that in logistic regression we use a non-linear function after the linear transformation to enable probabilistic interpretation for the output. For binary classification, the logistic sigmoid function