In this assignment you are going to explore several well-known linear classification methods, such as the perceptron and logistic regression; a realistic dataset is provided for you to evaluate these methods.
To start with, we consider the least squares model as an extension of the linear regression model to the problem of classification, together with the perceptron algorithm. You are given a simple linearly separable dataset containing two classes of points on a 2D plane, and you should build a model that classifies them correctly.
You are provided with a function gen_linear_seperatable_2d_2c_dataset in the handout file; import it into your solution as in the first assignment.
Requirements
Specifically, you should use the least squares model and the perceptron algorithm to learn two models that separate the dataset, and report the accuracy of each model after training.
Since the two models are linear, you should be able to draw a decision line on top of the dataset to visually show how it separates the two classes. Include this plot in your report.
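For reference, here is a minimal NumPy sketch of both models. It assumes you have already unpacked the dataset into a feature matrix X of shape (N, 2) and labels y in {-1, +1}; the exact return format of gen_linear_seperatable_2d_2c_dataset may differ, so adapt the data loading accordingly.

```python
import numpy as np

def add_bias(X):
    """Prepend a constant 1 column so the bias is part of the weight vector."""
    return np.hstack([np.ones((X.shape[0], 1)), X])

def least_square_fit(X, y):
    """Least squares fit via the pseudoinverse: w = (X^T X)^{-1} X^T y."""
    return np.linalg.pinv(add_bias(X)) @ y

def perceptron_fit(X, y, epochs=100):
    """Perceptron algorithm: update w on every misclassified point."""
    X_aug = add_bias(X)
    w = np.zeros(X_aug.shape[1])
    for _ in range(epochs):
        for x_n, t_n in zip(X_aug, y):
            if np.sign(x_n @ w) != t_n:   # misclassified (or on the boundary)
                w += t_n * x_n
    return w

def accuracy(X, y, w):
    """Fraction of points whose predicted sign matches the label."""
    return np.mean(np.sign(add_bias(X) @ w) == y)
```

With either weight vector w, the decision line is w[0] + w[1] * x1 + w[2] * x2 = 0, which you can plot over a scatter of the data points.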
Part 2
In this part of the assignment, you are required to use logistic regression for a simple text classification task. A function get_text_classification_datasets is also provided; it returns the training and testing datasets, and you can explore them by looking more closely at what it returns. Note that the first call to the function takes several minutes to cache the dataset to your disk (the path is ../../.., which is parallel to the root of the project directory).
Text classification is the task of labeling a given document with its corresponding genre, e.g. sports, news, health, etc. To represent a document in a form that logistic regression can conveniently process, we can encode it as a multi-hot vector.
First, you have to tokenize each document into words (a list of strings): remove all the characters in string.punctuation, replace every string.whitespace character with a space, and then split the document on spaces to obtain a list of strings. Subsequently, convert all characters to lowercase to simplify later processing. Then build a vocabulary over all the words in the training dataset, which simply maps each word in the training dataset to a number. For example,
```python
docs_toy = [
"""
Hi!
How are you?
""",
"""
Do you have a dog?
"""
]
```
When tokenized, you will have
```python
['hi', 'how', 'are', 'you'],
['do', 'you', 'have', 'a', 'dog']
```
and your vocabulary will look like
```python
{'a': 0, 'are': 1, 'do': 2, 'dog': 3, 'have': 4, 'hi': 5, 'how': 6, 'you': 7}
```
and you use the vocabulary to map each tokenized document to a multi-hot vector!
```python
[0. 1. 0. 0. 0. 1. 1. 1.]
[1. 0. 1. 1. 1. 0. 0. 1.]
```
As you can verify, these are the multi-hot representations of the two documents above.
In practice, the vocabulary is quite large, which may cause the multi-hot vectors to exceed memory limits! To address this problem, you can set a frequency threshold min_count and only keep the words that occur at least min_count times in the whole training set. For this problem, min_count = 10 is suitable.
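Below is a minimal sketch of this preprocessing pipeline. The helper names (tokenize, build_vocab, multi_hot) are illustrative assumptions, not part of the handout.

```python
import string
from collections import Counter
import numpy as np

def tokenize(doc):
    """Strip punctuation, map whitespace to spaces, lowercase, then split."""
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    doc = doc.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace)))
    return doc.lower().split()

def build_vocab(docs, min_count=10):
    """Map each word occurring at least min_count times to an integer index."""
    counts = Counter(w for doc in docs for w in tokenize(doc))
    kept = sorted(w for w, c in counts.items() if c >= min_count)
    return {w: i for i, w in enumerate(kept)}

def multi_hot(doc, vocab):
    """Encode one document as a 0/1 vector over the vocabulary."""
    vec = np.zeros(len(vocab))
    for w in tokenize(doc):
        if w in vocab:
            vec[vocab[w]] = 1.0
    return vec

# With min_count=1, build_vocab(docs_toy, 1) reproduces the toy vocabulary above.
```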
Once you can represent the documents as vectors (and the categories of the documents as one-hot vectors), you can use logistic regression!
Logistic regression is a kind of generalized linear model; the major (or only?) difference between logistic regression and least squares is that logistic regression applies a non-linear function after the linear transformation so that the output has a probabilistic interpretation. For binary classification, the logistic sigmoid function $\sigma(a) = \frac{1}{1 + e^{-a}}$ transforms the unbounded prediction $a$ from the output of the linear model to the bounded interval $(0, 1)$, which can be interpreted as the probability of the prediction.
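For instance, the sigmoid is a one-liner (a sketch for illustration):

```python
import numpy as np

def sigmoid(a):
    """Logistic sigmoid: maps any real number into the interval (0, 1)."""
    return 1.0 / (1.0 + np.exp(-a))
```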
Note that the logistic sigmoid function is just a special case of the following softmax function. Given a vector $\mathbf{a}$ of dimension $K$, the softmax function computes the vector $\operatorname{softmax}(\mathbf{a})$ with the following components:
$$\operatorname{softmax}(\mathbf{a})_i = \frac{e^{a_i}}{\sum_{j=1}^{K} e^{a_j}}$$
In other words, the softmax function first exponentiates the vector (elementwise) and then normalizes it so that all the components add up to 1. The resulting vector can be interpreted as a probability distribution over the classes. We can then make a prediction by selecting the class with the maximum associated probability:
$$\hat{y} = \arg\max_{i} \operatorname{softmax}(\mathbf{a})_i$$
One interesting property of the softmax function is that it is invariant to constant offsets, i.e. $\operatorname{softmax}(\mathbf{a} + c\mathbf{1}) = \operatorname{softmax}(\mathbf{a})$, where $c\mathbf{1}$ is a broadcast vector of equal constant values. This means you could (and perhaps should) subtract the maximum component of $\mathbf{a}$ from $\mathbf{a}$ before exponentiating to stabilize the numerical calculation.
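For example, a numerically stable softmax and the corresponding prediction rule could be sketched as follows (helper names are illustrative):

```python
import numpy as np

def softmax(a):
    """Numerically stable softmax: shift by the row-wise max before exponentiating."""
    shifted = a - np.max(a, axis=-1, keepdims=True)   # shift invariance: result unchanged
    e = np.exp(shifted)
    return e / np.sum(e, axis=-1, keepdims=True)

def predict(probs):
    """Pick the class with the maximum predicted probability."""
    return np.argmax(probs, axis=-1)
```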
To sum up, for logistic regression the predicted probability distributions are computed as
$$\hat{\mathbf{y}} = \operatorname{softmax}(W \mathbf{x} + \mathbf{b}),$$
where $\mathbf{x}$ is the multi-hot document vector and $W$ and $\mathbf{b}$ are the weights and bias of the linear model.
Because of the softmax function, we cannot write down a closed-form solution for the optimization; therefore, as is typically done in machine learning, we will use gradient descent to minimize a loss function on top of the prediction of the linear model. Concretely, we use the cross entropy loss function defined as
$$L = -\frac{1}{N} \sum_{n=1}^{N} \log \hat{y}_{n, t_n},$$
where $\hat{y}_{n, t_n}$ is the predicted probability of the correct class $t_n$ for the $n$-th training example and $N$ is the total number of training examples. In order to reduce overfitting, an additional term penalizing large weights is added to the loss function:
$$\tilde{L} = -\frac{1}{N} \sum_{n=1}^{N} \log \hat{y}_{n, t_n} + \lambda \lVert W \rVert_2^2.$$
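A sketch of this regularized loss, assuming the $\lambda \lVert W \rVert_2^2$ penalty written above; the name lam is a hypothetical stand-in for $\lambda$:

```python
import numpy as np

def cross_entropy_loss(probs, targets, W, lam=1e-3):
    """Average negative log-probability of the correct class, plus an L2 penalty on W.

    probs:   (N, K) predicted distributions from softmax
    targets: (N,)   integer class labels
    """
    n = probs.shape[0]
    nll = -np.mean(np.log(probs[np.arange(n), targets] + 1e-12))  # small epsilon avoids log(0)
    return nll + lam * np.sum(W ** 2)
```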
Stochastic gradient descent takes the gradient of the loss with respect to each parameter and then updates the parameter in the direction of the negative gradient, namely
$$W \leftarrow W - \eta \frac{\partial \tilde{L}}{\partial W},$$
where $\eta$ is a hyperparameter called the learning rate that adjusts the magnitude of the weight updates.
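A sketch of one such update for softmax regression: the gradient of the mean cross entropy with respect to the logits is the predicted distribution minus the one-hot target, and the L2 penalty contributes $2\lambda W$ (consistent with the loss written above).

```python
import numpy as np

def softmax(a):
    e = np.exp(a - a.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def sgd_step(W, b, X_batch, t_batch, lr=0.1, lam=1e-3):
    """One mini-batch gradient step for softmax regression with an L2 penalty on W."""
    n, k = X_batch.shape[0], W.shape[1]
    probs = softmax(X_batch @ W + b)          # (n, K) predicted distributions
    onehot = np.eye(k)[t_batch]               # (n, K) one-hot targets
    grad_logits = (probs - onehot) / n        # d(mean cross entropy) / d(logits)
    grad_W = X_batch.T @ grad_logits + 2 * lam * W
    grad_b = grad_logits.sum(axis=0)
    W = W - lr * grad_W                       # move against the gradient
    b = b - lr * grad_b
    return W, b
```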
Since the loss function of logistic regression is convex (as you can verify by differentiating it twice), gradient descent is guaranteed to reach the global minimum of the loss function.
To conclude, the requirements are:
Do not peek into the test dataset before this requirement!