7174 lines
250 KiB
SQL
7174 lines
250 KiB
SQL
-- ============================================
|
||
-- PastPaper Master — COMP2211 problem-level questions
|
||
-- Seed Date: 2026-03-24
|
||
-- ============================================
|
||
--
|
||
-- Preconditions:
|
||
-- 1. Run comp2211_course_library_papers.sql first.
|
||
-- 2. Ensure those paper rows already exist in `papers`.
|
||
--
|
||
-- This seed inserts one row per top-level Problem.
|
||
-- It is intentionally coarse-grained and idempotent.
|
||
|
||
INSERT INTO paper_questions (
|
||
paper_id,
|
||
question_number,
|
||
parent_question,
|
||
display_order,
|
||
question_type,
|
||
question_format,
|
||
question_text,
|
||
score,
|
||
page_number,
|
||
options,
|
||
correct_option,
|
||
correct_answer,
|
||
raw_answer_text,
|
||
topics,
|
||
topic_primary,
|
||
analytics_topic,
|
||
topic_tags,
|
||
skill_tags,
|
||
difficulty,
|
||
knowledge_reminder,
|
||
ai_hint,
|
||
solution
|
||
)
|
||
SELECT
|
||
p.id,
|
||
seed.question_number,
|
||
seed.parent_question,
|
||
seed.display_order,
|
||
seed.question_type,
|
||
seed.question_format,
|
||
seed.question_text,
|
||
seed.score,
|
||
seed.page_number,
|
||
seed.options,
|
||
seed.correct_option,
|
||
seed.correct_answer,
|
||
seed.raw_answer_text,
|
||
seed.topics,
|
||
seed.topic_primary,
|
||
seed.analytics_topic,
|
||
seed.topic_tags,
|
||
seed.skill_tags,
|
||
seed.difficulty,
|
||
seed.knowledge_reminder,
|
||
seed.ai_hint,
|
||
seed.solution
|
||
FROM (
|
||
VALUES
|
||
('COMP2211-2022-fall-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [15 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1.5 points for each correct answer.
|
||
(a) Machine learning gives computers the ability to make decision by writing down rules and
|
||
methods and being explicitly programmed.
|
||
(b) The regression technique in machine learning is NOT a group of algorithms that are used
|
||
for predicting a class/category.
|
||
(c) A 5-fold cross validation for K-nearest neighbors algorithm means that for each value
|
||
of K, we randomly select 1/5 of the training data as the validation set to evaluate the
|
||
model which is trained by the remaining (4/5) of the training data.
|
||
(d) If we use K-means clustering, we will get the same cluster assignments for each data
|
||
point, whether or not we standardize the variables.
|
||
(e) Given an input data point x, where all the attribute values in x are real numbers. Suppose
|
||
you are asked to predict a label y for x, where y = 0 or y = 1. Assume you have no
|
||
knowledge about the distributions of x and y, perceptron is an appropriate method for
|
||
this problem.
|
||
(f) A perceptron with the unit step function (i.e., f(z) = 0 if z ≤0, otherwise f(z) = 1) as
|
||
the activation function cannot be used for multi-class classification.
|
||
(g) If a training dataset is linearly separable into two classes, the perceptron learning rule
|
||
will always converge to weights and bias that accomplish the desired classification.
|
||
(h) The neural network weights are updated during forward propagation.
|
||
(i) An advantage of gradient descent-based methods, such as back-propagation, is that they
|
||
cannot get stuck in local minima.
|
||
(j) The back-propagation algorithm, when run until a minimum is achieved, always finds
|
||
the same solution (i.e., weights and biases) no matter what the initial set of weights and
|
||
biases are.
|
||
Question
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)
|
||
Answer', 15, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [15 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1.5 points for each correct answer.
|
||
(a) Machine learning gives computers the ability to make decision by writing down rules and
|
||
methods and being explicitly programmed.
|
||
(b) The regression technique in machine learning is NOT a group of algorithms that are used
|
||
for predicting a class/category.
|
||
(c) A 5-fold cross validation for K-nearest neighbors algorithm means that for each value
|
||
of K, we randomly select 1/5 of the training data as the validation set to evaluate the
|
||
model which is trained by the remaining (4/5) of the training data.
|
||
(d) If we use K-means clustering, we will get the same cluster assignments for each data
|
||
point, whether or not we standardize the variables.
|
||
(e) Given an input data point x, where all the attribute values in x are real numbers. Suppose
|
||
you are asked to predict a label y for x, where y = 0 or y = 1. Assume you have no
|
||
knowledge about the distributions of x and y, perceptron is an appropriate method for
|
||
this problem.
|
||
(f) A perceptron with the unit step function (i.e., f(z) = 0 if z ≤0, otherwise f(z) = 1) as
|
||
the activation function cannot be used for multi-class classification.
|
||
(g) If a training dataset is linearly separable into two classes, the perceptron learning rule
|
||
will always converge to weights and bias that accomplish the desired classification.
|
||
(h) The neural network weights are updated during forward propagation.
|
||
(i) An advantage of gradient descent-based methods, such as back-propagation, is that they
|
||
cannot get stuck in local minima.
|
||
(j) The back-propagation algorithm, when run until a minimum is achieved, always finds
|
||
the same solution (i.e., weights and biases) no matter what the initial set of weights and
|
||
biases are.
|
||
Question
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)
|
||
Answer
|
||
F
|
||
T
|
||
F
|
||
F
|
||
F
|
||
T
|
||
T
|
||
F
|
||
F
|
||
F
|
||
Marking scheme:
|
||
1.5 points for each correct answer. 15 points in total', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-fall-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [19 points] Python Fundamentals
|
||
(a)
|
||
[7 points] Consider the following NumPy arrays:
|
||
import numpy as np
|
||
A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||
B = np.array([[[0, 1, 2, 3],
|
||
[4, 5, 6, 7],
|
||
[8, 9, 10, 11]],
|
||
[[12, 13, 14, 15],
|
||
[16, 17, 18, 19],
|
||
[20, 21, 22, 23]]])
|
||
Write the output for each of the following Python statements. If the output is an empty
|
||
array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) print(A[::3])
|
||
(ii) print(A[:-2:-2])
|
||
(iii) print(B[:, 1, 3:0:-1])
|
||
(iv) print(B[0, 1, [0,3]])
|
||
The NumPy array B is repeated here to ease your reading.
|
||
B = np.array([[[0, 1, 2, 3],
|
||
[4, 5, 6, 7],
|
||
[8, 9, 10, 11]],
|
||
[[12, 13, 14, 15],
|
||
[16, 17, 18, 19],
|
||
[20, 21, 22, 23]]])
|
||
(v) print(B[ B % 4 == 0 ])
|
||
(vi) # numpy.sum(a, axis)
|
||
# returns the sum of array elements over a given axis
|
||
print( np.sum( B, axis=1 ) )
|
||
(vii) # numpy.ndarray.reshape(shape)
|
||
# returns an array containing the same data with a new shape
|
||
print( B.reshape( (3, -1, 2) ) )
|
||
(b)
|
||
[6 points] Write the output for the following Python code segments. If the output is an
|
||
empty array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) import numpy as np
|
||
A = np.array([[1, 2, 3], [2, 4, 6]])
|
||
B = np.array([1, 2, 3])
|
||
print(A * B)
|
||
(ii) import numpy as np
|
||
A = np.array([1, 2, 3, 4])
|
||
B = np.array([[1, 2], [2, 4], [3, 6], [4, 8]])
|
||
print(A + B)
|
||
(iii) import numpy as np
|
||
A = np.array([1, 2, 3, 4])
|
||
B = np.array([[2], [3], [4]])
|
||
print(A + B)
|
||
(c)
|
||
[6 points] The cosine similarity of two non-zero vectors, xTrain = (xTrain
|
||
, . . . , xTrain
|
||
n
|
||
)
|
||
and xTest = (xTest
|
||
, . . . , xTest
|
||
n
|
||
), can be calculated by the following formula:
|
||
Pn
|
||
i=1 (xTrain
|
||
i
|
||
× xTest
|
||
i
|
||
)
|
||
qPn
|
||
i=1 (xTrain
|
||
i
|
||
)2
|
||
qPn
|
||
i=1 (xTest
|
||
i
|
||
)2
|
||
For example, if a training sample xTrain is (0, 1, 2) and a testing sample xTest is (4, 6,
|
||
8), the cosine similarity is:
|
||
0 × 4 + 1 × 6 + 2 × 8
|
||
√
|
||
02 + 12 + 22√
|
||
42 + 62 + 82 = 0.91350028
|
||
Given the following NumPy arrays, X_train and X_test, where each 1D array represents
|
||
a data point:
|
||
import numpy as np
|
||
X_train = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]])
|
||
X_test = np.array([[4, 6, 8], [5, 0, 0]])
|
||
Compute the cosine similarity scores between each data point in X_train and each data
|
||
point in X_test with a one-line Python expression, such that the evaluated result of
|
||
the expression is:
|
||
[[0.91350028 0.
|
||
]
|
||
[1.
|
||
0.37139068]
|
||
[0.99461155 0.45584231]]
|
||
N ote:
|
||
An expression is a combination of values, variables, operators, and calls to functions.
|
||
Your expression should work with any number of data points in X_train and X_test
|
||
and any number of values in the data points.
|
||
You can assume that the number of attribute values in each data point is the same
|
||
for both X_train and X_test.
|
||
There must be no explicit loops in your expression.
|
||
You may find the following attribute or functions useful for this question.
|
||
Dot product of two arrays, a and b:
|
||
numpy.dot(a, b)
|
||
– It returns the product of matrix multiplication.
|
||
Return the element-wise square of an array, x
|
||
numpy.square(x)
|
||
Return the sum of array elements over a given axis.
|
||
numpy.sum(a, axis=None)
|
||
– a: the input array with elements to sum.
|
||
– axis: None or int or tuple of ints
|
||
Return the non-negative square-root of an array, element-wise.
|
||
numpy.sqrt(x)
|
||
– x: the input array with values whose square-roots are required.
|
||
Insert a new axis that will appear at the axis position in the expanded array shape.
|
||
numpy.expand_dims(a, axis)
|
||
– a: the input array.
|
||
– axis: an int or tuple of ints that represents position in the expanded axes where
|
||
the new axis (or axes) is placed.
|
||
The transposed array.
|
||
numpy.ndarray.T
|
||
Write the one-line Python expression below:
|
||
print(
|
||
)', 19, 3, NULL::jsonb, NULL, NULL, 'Problem 2 [19 points] Python Fundamentals
|
||
(a)
|
||
[7 points] Consider the following NumPy arrays:
|
||
Marking scheme:
|
||
1 point each sub-questions, i.e. (i) to (vii)
|
||
No partial score
|
||
import numpy as np
|
||
A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||
B = np.array([[[0, 1, 2, 3],
|
||
[4, 5, 6, 7],
|
||
[8, 9, 10, 11]],
|
||
[[12, 13, 14, 15],
|
||
[16, 17, 18, 19],
|
||
[20, 21, 22, 23]]])
|
||
Write the output for each of the following Python statements. If the output is an empty
|
||
array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) print(A[::3])
|
||
Answer:
|
||
[1 4 7]
|
||
(ii) print(A[:-2:-2])
|
||
Answer:
|
||
[9]
|
||
(iii) print(B[:, 1, 3:0:-1])
|
||
Answer:
|
||
[[ 7
|
||
5]
|
||
[19 18 17]]
|
||
(iv) print(B[0, 1, [0,3]])
|
||
Answer:
|
||
[4 7]
|
||
(v) print(B[ B % 4 == 0 ])
|
||
Answer:
|
||
[ 0
|
||
8 12 16 20]
|
||
(vi) # numpy.sum(a, axis)
|
||
# returns the sum of array elements over a given axis
|
||
print( np.sum( B, axis=1 ) )
|
||
Answer:
|
||
[[12 15 18 21]
|
||
[48 51 54 57]]
|
||
(vii) # numpy.ndarray.reshape(shape)
|
||
# returns an array containing the same data with a new shape
|
||
print( B.reshape( (3, -1, 2) ) )
|
||
Answer:
|
||
[[[ 0
|
||
1]
|
||
[ 2
|
||
3]
|
||
[ 4
|
||
5]
|
||
[ 6
|
||
7]]
|
||
[[ 8
|
||
9]
|
||
[10 11]
|
||
[12 13]
|
||
[14 15]]
|
||
[[16 17]
|
||
[18 19]
|
||
[20 21]
|
||
[22 23]]]
|
||
(b)
|
||
[6 points] Write the output for the following Python code segments. If the output is an
|
||
empty array, write “Empty Array”. If an error occurs, write “Error”.
|
||
Marking scheme:
|
||
2 points each sub-questions, i.e. (i) to (iii)
|
||
No partial score
|
||
(i) import numpy as np
|
||
A = np.array([[1, 2, 3], [2, 4, 6]])
|
||
B = np.array([1, 2, 3])
|
||
print(A * B)
|
||
Answer:
|
||
[[ 1
|
||
9]
|
||
[ 2
|
||
8 18]]
|
||
(ii) import numpy as np
|
||
A = np.array([1, 2, 3, 4])
|
||
B = np.array([[1, 2], [2, 4], [3, 6], [4, 8]])
|
||
print(A + B)
|
||
Answer:
|
||
Error
|
||
(iii) import numpy as np
|
||
A = np.array([1, 2, 3, 4])
|
||
B = np.array([[2], [3], [4]])
|
||
print(A + B)
|
||
Answer:
|
||
[[3 4 5 6]
|
||
[4 5 6 7]
|
||
[5 6 7 8]]
|
||
(c)
|
||
[6 points] The cosine similarity of two non-zero vectors, xTrain = (xTrain
|
||
, . . . , xTrain
|
||
n
|
||
)
|
||
and xTest = (xTest
|
||
, . . . , xTest
|
||
n
|
||
), can be calculated by the following formula:
|
||
Pn
|
||
i=1 (xTrain
|
||
i
|
||
× xTest
|
||
i
|
||
)
|
||
qPn
|
||
i=1 (xTrain
|
||
i
|
||
)2
|
||
qPn
|
||
i=1 (xTest
|
||
i
|
||
)2
|
||
For example, if a training sample xTrain is (0, 1, 2) and a testing sample xTest is (4, 6,
|
||
8), the cosine similarity is:
|
||
0 × 4 + 1 × 6 + 2 × 8
|
||
√
|
||
02 + 12 + 22√
|
||
42 + 62 + 82 = 0.91350028
|
||
Given the following NumPy arrays, X_train and X_test, where each 1D array represents
|
||
a data point:
|
||
import numpy as np
|
||
X_train = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]])
|
||
X_test = np.array([[4, 6, 8], [5, 0, 0]])
|
||
Compute the cosine similarity scores between each data point in X_train and each data
|
||
point in X_test with a one-line Python expression, such that the evaluated result of
|
||
the expression is:
|
||
[[0.91350028 0.
|
||
]
|
||
[1.
|
||
0.37139068]
|
||
[0.99461155 0.45584231]]
|
||
N ote:
|
||
An expression is a combination of values, variables, operators, and calls to functions.
|
||
Your expression should work with any number of data points in X_train and X_test
|
||
and any number of values in the data points.
|
||
You can assume that the number of attribute values in each data point is the same
|
||
for both X_train and X_test.
|
||
There must be no explicit loops in your expression.
|
||
You may find the following attribute or functions useful for this question.
|
||
Dot product of two arrays, a and b:
|
||
numpy.dot(a, b)
|
||
– It returns the product of matrix multiplication.
|
||
Return the element-wise square of an array, x
|
||
numpy.square(x)
|
||
Return the sum of array elements over a given axis.
|
||
numpy.sum(a, axis=None)
|
||
– a: the input array with elements to sum.
|
||
– axis: None or int or tuple of ints
|
||
Return the non-negative square-root of an array, element-wise.
|
||
numpy.sqrt(x)
|
||
– x: the input array with values whose square-roots are required.
|
||
Insert a new axis that will appear at the axis position in the expanded array shape.
|
||
numpy.expand_dims(a, axis)
|
||
– a: the input array.
|
||
– axis: an int or tuple of ints that represents position in the expanded axes where
|
||
the new axis (or axes) is placed.
|
||
The transposed array.
|
||
numpy.ndarray.T
|
||
Write the one-line Python expression below:
|
||
print(
|
||
)
|
||
Answer:
|
||
print(X_train.dot(X_test.T) /
|
||
np.dot(np.expand_dims(np.sqrt(np.sum(X_train**2, axis=1)), 1),
|
||
np.expand_dims(np.sqrt(np.sum(X_test**2, axis=1)), 1).T)
|
||
)
|
||
print(X_train.dot(X_test.T) /
|
||
np.dot(np.expand_dims(np.sqrt(np.sum(X_train**2, axis=1)), 1),
|
||
np.expand_dims(np.sqrt(np.sum(X_test**2, axis=1)), 0))
|
||
)
|
||
print(np.dot(X_train, X_test.transpose()) /
|
||
(np.sqrt(np.sum(np.square(X_train),axis=1)[:, None]) *
|
||
np.sqrt(np.sum(np.square(X_test), axis=1)[None, :]))
|
||
)
|
||
print(np.dot(X_train, np.transpose(X_test)) /
|
||
(np.sqrt(np.sum(np.square(X_train),axis=1)[:, np.newaxis]) *
|
||
np.sqrt(np.sum(np.square(X_test), axis=1)[np.newaxis,:]))
|
||
)
|
||
print(np.dot(X_train, X_test.T) /
|
||
np.sqrt(np.sum(np.square(X_train), axis=1)).reshape(3,1).dot(
|
||
np.sqrt(np.sum(np.square(X_test), axis=1)).reshape(1, 2))
|
||
)
|
||
Marking scheme:
|
||
2 points for correct numerator
|
||
– If not using np.dot correctly, deduct 1 point (np.dot can be replaced by np.matmul,
|
||
@);
|
||
– If not using np.ndarray.T correctly, deduct 1 point (np.ndarray.T can be replaced
|
||
by np.transpose);
|
||
– The maximum number of points can be deducted for numerator is 2.
|
||
4 points for correct denominator
|
||
– If not using np.expand dims correctly (including specifying axis), deduct 1 point
|
||
for each (np.expand dims can be replaced by np.reshape or adding a new axis);
|
||
– If not using np.dot correctly, deduct 1 point (np.dot can be replaced by np.matmul,
|
||
@);
|
||
– If not using np.sqrt correctly, deduct 1 point for each;
|
||
– If not using np.sum correcly (including specifying axis), deduct 1 point for each;
|
||
– If not using np.square correctly, deduct 1 point for each (np.square can be re-
|
||
placed by np.ndarray ** 2);
|
||
– The maximum number of points can be deducted for denominator is 4. If points
|
||
to be deducted exceed 4, the denominator part gets 0 point.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2022-fall-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [18 points] Conditional Probability and Bayes Classifier
|
||
(a)
|
||
[4 points] Assume the probability of getting an A+ in COMP 2211 is 0.08. The prob-
|
||
ability of getting a score greater than 90 in the midterm exam given that a student
|
||
gets an A+ in COMP 2211 is 0.95, and the probability of getting a score greater than
|
||
90 in the midterm exam given that the student does not get an A+ in COMP 2211 is 0.05.
|
||
(i) Calculate the probability of getting a score greater than 90 in the midterm exam.
|
||
Show all your steps. Round your answer to 2 decimal places.
|
||
The formula that you may find useful for this question:
|
||
P(E) = P(E|B)P(B) + P(E|Not B)P(Not B)
|
||
(ii) Use Bayes’ rule to calculate the probability of getting an A+ given that a student
|
||
gets a score greater than 90 in the midterm exam. Show all your steps. Round your
|
||
answer to 2 decimal places.
|
||
The formula that you may find useful for this question:
|
||
P(B|E) = P(B)P(E|B)
|
||
P(E)
|
||
(b)
|
||
[8 points] Given the following dataset (X, y) with 10 training examples, each contains
|
||
2 attributes (x1, x2) and its binary label y.
|
||
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
|
||
y = [0 0 1 1 1 1 0 0 1 1]
|
||
Suppose you believe that a naive Bayes model would be appropriate for this dataset, and
|
||
you want to classify the test sample:
|
||
x = [1 1]
|
||
(i) Compute the class prior probabilities, i.e., P(y = 0) and P(y = 1).
|
||
(ii) Compute the 4 conditional probabilities required by naive Bayes for the test sample,
|
||
i.e., P(x1 = 1|y = 0), P(x2 = 1|y = 0), P(x1 = 1|y = 1), P(x2 = 1|y = 1).
|
||
(iii) Under the naive Bayes model and the probabilities you compute in parts b(i) and
|
||
b(ii), what is the most likely label for the test sample, i.e., 0 or 1? If there is a tie,
|
||
please include the word “tie” in your answer. Show all your steps.
|
||
The formula that you may find useful for this question:
|
||
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) . . . P(ed|Bi))
|
||
(c)
|
||
[3 points] Briefly describe a zero frequency problem of Naive Bayes classification and
|
||
suggest a way to solve the problem. Also, state whether a zero frequency problem occurs
|
||
in part (b).
|
||
The dataset (X, y) is repeated here to ease your reading.
|
||
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
|
||
y = [0 0 1 1 1 1 0 0 1 1]
|
||
(d)
|
||
[3 points] Consider the dataset given in part (b), i.e., the one above. Suppose you do
|
||
NOT believe that the naive Bayes model would be appropriate for this dataset, and you
|
||
want to classify the test sample:
|
||
x = [0 0]
|
||
using the Bayes model without making the naive assumption. What is the most
|
||
likely label for the test sample? If there is a tie, please include the word “tie” in your
|
||
answer. Show all your steps.
|
||
The formula that you may find useful for this question:
|
||
BNB = argmaxBiP(Bi)P((e1, e2, e3, . . . , ed)|Bi)', 18, 8, NULL::jsonb, NULL, NULL, 'Problem 3 [18 points] Conditional Probability and Bayes Classifier
|
||
(a)
|
||
[4 points] Assume the probability of getting an A+ in COMP 2211 is 0.08. The prob-
|
||
ability of getting a score greater than 90 in the midterm exam given that a student
|
||
gets an A+ in COMP 2211 is 0.95, and the probability of getting a score greater than
|
||
90 in the midterm exam given that the student does not get an A+ in COMP 2211 is 0.05.
|
||
(i) [3 points] Calculate the probability of getting a score greater than 90 in the midterm
|
||
exam. Show all your steps. Round your answer to 2 decimal places.
|
||
The formula that you may find useful for this question:
|
||
P(E) = P(E|B)P(B) + P(E|Not B)P(Not B)
|
||
Answer:
|
||
P(A+) =0.08
|
||
P(> 90|A+) =0.95
|
||
P(> 90|Not A+) =0.05
|
||
P(> 90) =P(> 90|A+)P(A+) + P(> 90|Not A+)P(Not A+)
|
||
=0.95(0.08) + 0.05(0.92) = 0.12
|
||
Marking scheme:
|
||
0.5 point for P(A+) = 0.08
|
||
0.5 point for P(> 90|A+) = 0.95
|
||
0.5 point for P(> 90|Not A+) = 0.05
|
||
0.5 point for P(Not A+) = 0.92
|
||
1 point for P(> 90) = 0.12
|
||
(ii)
|
||
[1 point] Use Bayes’ rule to calculate the probability of getting an A+ given that
|
||
a student gets a score greater than 90 in the midterm exam. Show all your steps.
|
||
Round your answer to 2 decimal places.
|
||
The formula that you may find useful for this question:
|
||
P(B|E) = P(B)P(E|B)
|
||
P(E)
|
||
Answer:
|
||
P(A + | > 90) = P(A+)P(> 90|A+)
|
||
P(> 90)
|
||
= 0.08(0.95)
|
||
0.12
|
||
= 0.63
|
||
Marking scheme:
|
||
1 point for P(A + | > 90) = 0.63
|
||
(b)
|
||
[8 points] Given the following dataset (X, y) with 10 training examples, each contains
|
||
2 attributes (x1, x2) and its binary label y.
|
||
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
|
||
y = [0 0 1 1 1 1 0 0 1 1]
|
||
Suppose you believe that a naive Bayes model would be appropriate for this dataset, and
|
||
you want to classify the test sample:
|
||
x = [1 1]
|
||
(i)
|
||
[1 point] Compute the class prior probabilities, i.e., P(y = 0) and P(y = 1).
|
||
Answer:
|
||
P(y = 0) = 4
|
||
10 = 2
|
||
P(y = 1) = 6
|
||
10 = 3
|
||
Marking scheme:
|
||
0.5 point for P(y = 0) = 2
|
||
0.5 point for P(y = 1) = 3
|
||
(ii)
|
||
[4 points] Compute the 4 conditional probabilities required by naive Bayes for
|
||
the test sample, i.e., P(x1 = 1|y = 0), P(x2 = 1|y = 0), P(x1 = 1|y = 1),
|
||
P(x2 = 1|y = 1).
|
||
Answer:
|
||
P(x1 = 1|y = 0) = 0
|
||
P(x2 = 1|y = 0) = 3
|
||
P(x1 = 1|y = 1) = 5
|
||
P(x2 = 1|y = 1) = 3
|
||
6 = 1
|
||
Marking scheme:
|
||
1 point for P(x1 = 1|y = 0) = 0
|
||
1 point for P(x2 = 1|y = 0) = 3
|
||
1 point for P(x1 = 1|y = 1) = 5
|
||
1 point for P(x2 = 1|y = 1) = 1
|
||
(iii)
|
||
[3 points] Under the naive Bayes model and the probabilities you compute in parts
|
||
b(i) and b(ii), what is the most likely label for the test sample, i.e., 0 or 1? If there
|
||
is a tie, please include the word “tie” in your answer. Show all your steps.
|
||
The formula that you may find useful for this question:
|
||
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) . . . P(ed|Bi))
|
||
Answer:
|
||
The numerator part of P(y = 0|x) = P(y = 0)P(x1 = 1|y = 0)P(x2 = 1|y = 0)
|
||
= 0
|
||
The numerator part of P(y = 1|x) = P(y = 1)P(x1 = 1|y = 1)P(x2 = 1|y = 1)
|
||
= 3
|
||
5
|
||
1
|
||
|
||
= 1
|
||
So, the most likely label for the test sample is 1.
|
||
Marking scheme:
|
||
1 point for the numerator part of P(y = 0|x) = 0
|
||
1 point for the numerator part of P(y = 1|x) = 1
|
||
1 point for stating the most likely label for the test sample is 1
|
||
(c)
|
||
[3 points] Briefly describe a zero frequency problem of Naive Bayes classification and
|
||
suggest a way to solve the problem. Also, state whether a zero frequency problem occurs
|
||
in part (b).
|
||
Answer:
|
||
If categorical variable has a category in test data set, which was not observed in the
|
||
training data set, then the frequency-based probability estimate will be zero. And we
|
||
will get a zero when all the probabilities are multiplied. This will be unable to make a
|
||
prediction. This is known as zero frequency.
|
||
A way to overcome this “zero frequency problem” is to add one to the count for ev-
|
||
ery attribute value-class combination when an attribute value does not occur with every
|
||
class value.
|
||
A zero frequency problem occurs in part (b).
|
||
Marking scheme:
|
||
1 point for briefly describing a zero frequency problem
|
||
1 point for suggesting a way to solve the problem
|
||
*** Please note that suggesting “adding more training data/increase the dataset
|
||
size/Using Log()” will not get any mark ***
|
||
1 point for stating a zero frequency problem occurs in part (b)
|
||
The dataset (X, y) is repeated here to ease your reading.
|
||
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
|
||
y = [0 0 1 1 1 1 0 0 1 1]
|
||
(d)
|
||
[3 points] Consider the dataset given in part (b), i.e., the one above. Suppose you do
|
||
NOT believe that the naive Bayes model would be appropriate for this dataset, and you
|
||
want to classify the test sample:
|
||
x = [0 0]
|
||
using the Bayes model without making the naive assumption. What is the most
|
||
likely label for the test sample? If there is a tie, please include the word “tie” in your
|
||
answer. Show all your steps.
|
||
The formula that you may find useful for this question:
|
||
BNB = argmaxBiP(Bi)P((e1, e2, e3, . . . , ed)|Bi)
|
||
Answer:
|
||
The numerator part of P(y = 0|x) = P(y = 0)P((x1 = 0, x2 = 0)|y = 0)
|
||
=
|
||
2
|
||
1
|
||
|
||
= 1
|
||
The numerator part of P(y = 1|x) = P(y = 1)P((x1 = 0, x2 = 0)|y = 1)
|
||
=
|
||
3
|
||
1
|
||
|
||
= 1
|
||
As the two probabilities are the same value, it’s a tie.
|
||
Marking scheme:
|
||
1 point for the numerator part of P(y = 0|x) = 1
|
||
1 point for the numerator part of P(y = 1|x) = 1
|
||
1 point for stating it’s a tie', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2022-fall-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors
|
||
Consider a set of 8 training data given as (xTrain, cTrain) values, where xTrain is a string with
|
||
6 bits and cTrain is the multi-class label, A, B or C:
|
||
{ ("101110", A), ("110100", B), ("011100", B), ("000101", B),
|
||
("011111", B), ("101101", B), ("100011", A), ("010000", C) }
|
||
Classify a test sample (xTest) with the string “101011” using a K-Nearest Neighbors classifier
|
||
(KNN) and Hamming distance.
|
||
Hamming distance is the number of bit positions in which the two bits are different. For
|
||
example, the hamming distance between two strings, “11011001” and “10011101” is 2.
|
||
(a)
|
||
[5 points] Complete the following table by filling in the computed Hamming distance
|
||
between each training data point and the test sample. Also, determine the class label of
|
||
the test sample using KNN with K = 3 based on the results.
|
||
xTrain
|
||
cTrain
|
||
Distance
|
||
“101110”
|
||
A
|
||
“110100”
|
||
B
|
||
“011100”
|
||
B
|
||
“000101”
|
||
B
|
||
“011111”
|
||
B
|
||
“101101”
|
||
B
|
||
“100011”
|
||
A
|
||
“010000”
|
||
C
|
||
The predicted class label of the test sample is
|
||
.
|
||
(b)
|
||
[3 points] What will happen if we use a KNN classifier with K = 4 instead to classify
|
||
the test sample above? Describe your answer.
|
||
(c)
|
||
[3 points] A student claims that if a KNN classifier with K = 3 is used with the above
|
||
training data, the predicted class label will never be class C no matter what the test
|
||
example is, if there is no tie-breaking strategy used. Is this claim correct or not? Explain
|
||
your answer.
|
||
(d)
|
||
[3 points] Is the above training data suitable for performing a D-fold cross-validation?
|
||
Explain your answer.', 14, 12, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors
|
||
Consider a set of 8 training data given as (xTrain, cTrain) values, where xTrain is a string with
|
||
6 bits and cTrain is the multi-class label, A, B or C:
|
||
{ ("101110", A), ("110100", B), ("011100", B), ("000101", B),
|
||
("011111", B), ("101101", B), ("100011", A), ("010000", C) }
|
||
Classify a test sample (xTest) with the string “101011” using a K-Nearest Neighbors classifier
|
||
(KNN) and Hamming distance.
|
||
Hamming distance is the number of bit positions in which the two bits are different. For
|
||
example, the hamming distance between two strings, “11011001” and “10011101” is 2.
|
||
(a)
|
||
[5 points] Complete the following table by filling in the computed Hamming distance
|
||
between each training data point and the test sample. Also, determine the class label of
|
||
the test sample using KNN with K = 3 based on the results.
|
||
xTrain
|
||
cTrain
|
||
Distance
|
||
“101110”
|
||
A
|
||
“110100”
|
||
B
|
||
“011100”
|
||
B
|
||
“000101”
|
||
B
|
||
“011111”
|
||
B
|
||
“101101”
|
||
B
|
||
“100011”
|
||
A
|
||
“010000”
|
||
C
|
||
The predicted class label of the test sample is
|
||
.
|
||
Answer:
|
||
xTrain
|
||
cTrain
|
||
Distance
|
||
“101110”
|
||
A
|
||
“110100”
|
||
B
|
||
“011100”
|
||
B
|
||
“000101”
|
||
B
|
||
“011111”
|
||
B
|
||
“101101”
|
||
B
|
||
“100011”
|
||
A
|
||
“010000”
|
||
C
|
||
The predicted class label of the test sample is A.
|
||
Marking scheme:
|
||
0.5 point for each distance, i.e. 4 points total
|
||
1 point for the predicted class label
|
||
(b)
|
||
[3 points] What will happen if we use a KNN classifier with K = 4 instead to classify
|
||
the test sample above? Describe your answer.
|
||
Answer:
|
||
We will have a tie where 2 nearest neighbors are with class label A and 2 nearest neighbors
|
||
are with class label B.
|
||
Marking scheme:
|
||
1 point for mentioning “Tie” or ”cannot find the class label”
|
||
2 points for the description of “2 NNs are class label A and 2 NNs are class label B”
|
||
(c)
|
||
[3 points] A student claims that if a KNN classifier with K = 3 is used with the above
|
||
training data, the predicted class label will never be class C no matter what the test
|
||
example is, if there is no tie-breaking strategy used. Is this claim correct or not? Explain
|
||
your answer.
|
||
Answer:
|
||
The claim is correct. If k = 3 and assume equal weights for all the points, no mat-
|
||
ter what the testing example is, it will be one of the following 3 cases:
|
||
none of the 3 nearest neighbors is with class label C, so the predicted label will not
|
||
be C.
|
||
1 of the 3 nearest neighbors is with class label C and the other 2 neighbors are both
|
||
with class label A, or both with class label B, so the predicted class label will only
|
||
be A or B.
|
||
A tie
|
||
Marking scheme:
|
||
1 point for “correct” or “yes”
|
||
2 points for the explanation
|
||
(d)
|
||
[3 points] Is the above training data suitable for performing a D-fold cross-validation?
|
||
Explain your answer.
|
||
Answer:
|
||
No, unless the number of folds is equal to the number of training data points.
|
||
The
|
||
above training data set is umbalanced. Hence, if we split the above training data set, the
|
||
data used for training may not have samples with class labels A or C.
|
||
Marking scheme:
|
||
1 point for “no”
|
||
2 points for the explanation', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-fall-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [12 points] K-Means Clustering
|
||
Consider the following training dataset:
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Attribute value
|
||
0.2
|
||
0.5
|
||
0.7
|
||
2.5
|
||
3.5
|
||
Apply the K-means clustering algorithm to this dataset for K=2, i.e., you will produce two
|
||
data clusters. Assume the distance measure you use is Euclidean distance, defined as follows.
|
||
distance(xTrain, xTest) =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n
|
||
X
|
||
i=1
|
||
(xTrain
|
||
i
|
||
−xTest
|
||
i
|
||
)2
|
||
(a) [1.5 points] Assume examples A and B are chosen as the initial centroid of clusters 1 and
|
||
cluster 2, respectively. Write down the resulting cluster assignments in the table below.
|
||
The cluster assignment for A and B has been done for you.
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Cluster Assignment
|
||
(b) [2 points] After assigning the examples to the clusters in part (a), re-compute the cluster
|
||
centroids to be the mean of the examples currently assigned to each cluster. For each
|
||
cluster, write the new cluster centroid in the table below.
|
||
Cluster
|
||
Centroid
|
||
(c)
|
||
[2.5 points] After recomputing the cluster centroids in part (b), re-assign the examples
|
||
to the clusters to which they are closest. Write down the resulting cluster assignments
|
||
in the table below.
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Cluster Assignment
|
||
(d) [2 points] After assigning the examples to the clusters in part (c), re-compute the cluster
|
||
centroids to be the mean of the examples currently assigned to each cluster. For each
|
||
cluster, write the new cluster centroid in the table below. If your answer is a floating
|
||
point number, round it to 2 decimal places.
|
||
Cluster
|
||
Centroid
|
||
(e)
|
||
[2.5 points] After recomputing the cluster centroids in part (d), re-assign the examples
|
||
to the clusters to which they are closest. Write down the resulting cluster assignments
|
||
in the table below.
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Cluster Assignment
|
||
(f)
|
||
[1.5 points] State whether the K-means clustering algorithm converges for this dataset
|
||
after performing all the steps in parts (a)-(e). Explain your answer.', 12, 14, NULL::jsonb, NULL, NULL, 'Problem 5 [12 points] K-Means Clustering
|
||
Consider the following training dataset:
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Attribute value
|
||
0.2
|
||
0.5
|
||
0.7
|
||
2.5
|
||
3.5
|
||
Apply the K-means clustering algorithm to this dataset for K=2, i.e., you will produce two
|
||
data clusters. Assume the distance measure you use is Euclidean distance, defined as follows.
|
||
distance(xTrain, xTest) =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n
|
||
X
|
||
i=1
|
||
(xTrain
|
||
i
|
||
−xTest
|
||
i
|
||
)2
|
||
(a) [1.5 points] Assume examples A and B are chosen as the initial centroid of clusters 1 and
|
||
cluster 2, respectively. Write down the resulting cluster assignments in the table below.
|
||
The cluster assignment for A and B has been done for you.
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Cluster Assignment
|
||
Marking scheme:
|
||
0.5 point for each correct cluster assignment. 1.5 points in total
|
||
(b) [2 points] After assigning the examples to the clusters in part (a), re-compute the cluster
|
||
centroids to be the mean of the examples currently assigned to each cluster. For each
|
||
cluster, write the new cluster centroid in the table below.
|
||
Cluster
|
||
Centroid
|
||
0.2
|
||
1.8
|
||
Marking scheme:
|
||
0.5 point for each correct centroid. 1 point in total
|
||
(c)
|
||
[2.5 points] After recomputing the cluster centroids in part (b), re-assign the examples
|
||
to the clusters to which they are closest. Write down the resulting cluster assignments
|
||
in the table below.
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Cluster Assignment
|
||
Marking scheme:
|
||
0.5 point for each correct cluster assignment. 2.5 points in total
|
||
(d) [2 points] After assigning the examples to the clusters in part (c), re-compute the cluster
|
||
centroids to be the mean of the examples currently assigned to each cluster. For each
|
||
cluster, write the new cluster centroid in the table below. If your answer is a floating
|
||
point number, round it to 2 decimal places.
|
||
Cluster
|
||
Centroid
|
||
0.47
|
||
Marking scheme:
|
||
1 point for each correct centroid. 2 points in total
|
||
(e)
|
||
[2.5 points] After recomputing the cluster centroids in part (d), re-assign the examples
|
||
to the clusters to which they are closest. Write down the resulting cluster assignments
|
||
in the table below.
|
||
Example
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Cluster Assignment
|
||
Marking scheme:
|
||
0.5 point for each correct cluster assignment. 2.5 points in total
|
||
(f)
|
||
[1.5 points] State whether the K-means clustering algorithm converges for this dataset
|
||
after performing all the steps in parts (a)-(e). Explain your answer.
|
||
Answer:
|
||
The algorithm converges for this dataset since the cluster assignment of examples re-
|
||
mains the same.
|
||
Marking scheme:
|
||
0.5 point for stating the algorithm converges
|
||
1 point for the correct explanation', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-fall-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [14 points] Perceptron
|
||
Suppose you are a CSE professor at HKUST who wants to offer a new course, COMP 2211.
|
||
Before starting, you want to predict if COMP 2211 will be successful or not. You approach
|
||
two students, A and B, asking them to read the proposal of 5 existing courses, COMP 1111,
|
||
COMP 2222, COMP 3333, COMP 4444, and COMP 5555, and rate them on a scale of 1
|
||
to 5 (assume only integer scores). Assume each student reads the proposals independently
|
||
and announces their rating. As each student might be biased, you may be unable to average
|
||
their ratings. Instead, you decide to use a perceptron to classify the data using the training
|
||
dataset in Table 1.
|
||
Course Code
|
||
x1
|
||
x2
|
||
Success
|
||
COMP 1111
|
||
No
|
||
COMP 2222
|
||
Yes
|
||
COMP 3333
|
||
No
|
||
COMP 4444
|
||
Yes
|
||
COMP 5555
|
||
Yes
|
||
Table 1: Training dataset
|
||
The training dataset is composed of the two student scores, x1 = score given by student A,
|
||
and x2 = score given by student B, and the corresponding true label regarding the success
|
||
of the courses judged based on the SFQ, for the 5 courses.
|
||
(a) [8 points] Train the perceptron to generate O = 1 if the course is success, O = 0 otherwise,
|
||
using the initial weight of w1 = 0, w2 = 0, θ = -1, and the activation function:
|
||
f(z) =
|
||
|
||
|
||
|
||
if z ≤0
|
||
otherwise
|
||
Also, use a learning rate of η = 1.
|
||
You may find the updating rules below useful.
|
||
∆wi = η(T −O)xi
|
||
∆θ = η(T −O)
|
||
wi = wi + ∆wi
|
||
θ = θ + ∆θ
|
||
Present each row of Table 1 as a training example, and update the perceptron weights
|
||
and bias before moving on to the next row. Show all the steps by completing Table 2.
|
||
x1
|
||
x2
|
||
T
|
||
O
|
||
∆w1
|
||
w1
|
||
∆w2
|
||
w2
|
||
∆θ
|
||
θ
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-1
|
||
Table 2: Perceptron algorithm execution
|
||
(b)
|
||
[6 points] Now, assume you do not care about the true label regarding the success of
|
||
the courses judged based on the SFQ. State whether each of the following scenarios for
|
||
which a perceptron using the ratings of the two students can correctly classify the data.
|
||
Justify your answer.
|
||
(i) If the total of their ratings is more than 8, then the course will be success and
|
||
otherwise it will fail, i.e.,
|
||
Course Code
|
||
x1
|
||
x2
|
||
Success
|
||
COMP 1111
|
||
No
|
||
COMP 2222
|
||
No
|
||
COMP 3333
|
||
Yes
|
||
COMP 4444
|
||
No
|
||
COMP 5555
|
||
No
|
||
(ii) The course will succeed if and only if each reviewer gives either a rating of 2 or a
|
||
rating of 3, i.e.,
|
||
Course Code
|
||
x1
|
||
x2
|
||
Success
|
||
COMP 1111
|
||
No
|
||
COMP 2222
|
||
Yes
|
||
COMP 3333
|
||
No
|
||
COMP 4444
|
||
No
|
||
COMP 5555
|
||
Yes', 14, 16, NULL::jsonb, NULL, NULL, 'Problem 6 [14 points] Perceptron
|
||
Suppose you are a CSE professor at HKUST who wants to offer a new course, COMP 2211.
|
||
Before starting, you want to predict if COMP 2211 will be successful or not. You approach
|
||
two students, A and B, asking them to read the proposal of 5 existing courses, COMP 1111,
|
||
COMP 2222, COMP 3333, COMP 4444, and COMP 5555, and rate them on a scale of 1
|
||
to 5 (assume only integer scores). Assume each student reads the proposals independently
|
||
and announces their rating. As each student might be biased, you may be unable to average
|
||
their ratings. Instead, you decide to use a perceptron to classify the data using the training
|
||
dataset in Table 1.
|
||
Course Code
|
||
x1
|
||
x2
|
||
Success
|
||
COMP 1111
|
||
No
|
||
COMP 2222
|
||
Yes
|
||
COMP 3333
|
||
No
|
||
COMP 4444
|
||
Yes
|
||
COMP 5555
|
||
Yes
|
||
Table 1: Training dataset
|
||
The training dataset is composed of the two student scores, x1 = score given by student A,
|
||
and x2 = score given by student B, and the corresponding true label regarding the success
|
||
of the courses judged based on the SFQ, for the 5 courses.
|
||
(a) [8 points] Train the perceptron to generate O = 1 if the course is success, O = 0 otherwise,
|
||
using the initial weight of w1 = 0, w2 = 0, θ = -1, and the activation function:
|
||
f(z) =
|
||
|
||
|
||
|
||
if z ≤0
|
||
otherwise
|
||
Also, use a learning rate of η = 1.
|
||
You may find the updating rules below useful.
|
||
∆wi = η(T −O)xi
|
||
∆θ = η(T −O)
|
||
wi = wi + ∆wi
|
||
θ = θ + ∆θ
|
||
Present each row of Table 1 as a training example, and update the perceptron weights
|
||
and bias before moving on to the next row. Show all the steps by completing Table 2.
|
||
x1
|
||
x2
|
||
T
|
||
O
|
||
∆w1
|
||
w1
|
||
∆w2
|
||
w2
|
||
∆θ
|
||
θ
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-1
|
||
-1
|
||
-4
|
||
-1
|
||
-5
|
||
-3
|
||
-1
|
||
-1
|
||
Table 2: Perceptron algorithm execution
|
||
Marking scheme:
|
||
0.2 point for each correct value. 40 values, 8 points in total
|
||
(b)
|
||
[6 points] Now, assume you do not care about the true label regarding the success of
|
||
the courses judged based on the SFQ. State whether each of the following scenarios for
|
||
which a perceptron using the ratings of the two students can correctly classify the data.
|
||
Justify your answer.
|
||
(i)
|
||
[3 points] If the total of their ratings is more than 8, then the course will be success
|
||
and otherwise it will fail, i.e.,
|
||
Course Code
|
||
x1
|
||
x2
|
||
Success
|
||
COMP 1111
|
||
No
|
||
COMP 2222
|
||
No
|
||
COMP 3333
|
||
Yes
|
||
COMP 4444
|
||
No
|
||
COMP 5555
|
||
No
|
||
Answer:
|
||
Perceptron can correctly classify the data in this scenario, because the data are
|
||
linearly separable.
|
||
Marking scheme:
|
||
1 point for stating the perceptron can correctly classify the data
|
||
2 points for the explanation
|
||
*** 2 points for the explanation part if correct weights and bias are given. ***
|
||
(ii)
|
||
[3 points] The course will succeed if and only if each reviewer gives either a rating
|
||
of 2 or a rating of 3, i.e.,
|
||
Course Code
|
||
x1
|
||
x2
|
||
Success
|
||
COMP 1111
|
||
No
|
||
COMP 2222
|
||
Yes
|
||
COMP 3333
|
||
No
|
||
COMP 4444
|
||
No
|
||
COMP 5555
|
||
Yes
|
||
Answer:
|
||
Perceptron cannot correctly classify the data in this scenario, because the data are
|
||
not linearly separable.
|
||
Marking scheme:
|
||
1 point for stating the perceptron cannot correctly classify the data
|
||
2 points for the explanation', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-fall-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [8 points] Multilayer Perceptron
|
||
Given the following training dataset with 4 examples in which each example has 3 binary
|
||
input attributes, x1, x2, x3 and 1 binary output T.
|
||
x1
|
||
x2
|
||
x3
|
||
T
|
||
Suppose we are learning a Multilayer Perceptron with the training dataset above; the neural
|
||
network has 1 hidden layer of 2 neurons, answer the following questions.
|
||
(a)
|
||
[2 points] How many weights and how many biases are there in the neural network?
|
||
(b)
|
||
[6 points] Draw the neural network below.
|
||
-------------------- END OF PAPER
|
||
--------------------
|
||
/* Rough work */
|
||
/* Rough work */', 8, 18, NULL::jsonb, NULL, NULL, 'Problem 7 [8 points] Multilayer Perceptron
|
||
Given the following training dataset with 4 examples in which each example has 3 binary
|
||
input attributes, x1, x2, x3 and 1 binary output T.
|
||
x1
|
||
x2
|
||
x3
|
||
T
|
||
Suppose we are learning a Multilayer Perceptron with the training dataset above; the neural
|
||
network has 1 hidden layer of 2 neurons, answer the following questions.
|
||
(a)
|
||
[2 points] How many weights and how many biases are there in the neural network?
|
||
Answer:
|
||
8 weights and 3 biases
|
||
Marking scheme:
|
||
1 point for the number of weights
|
||
1 point for the number of biases
|
||
*** 1 point if students only put 11 as the sum of these variables (e.g., 8 + 3 = 11) and
|
||
do not indicate 8 for weights and 3 for biases. ***
|
||
(b)
|
||
[6 points] Draw the neural network below.
|
||
Answer:
|
||
Marking scheme:
|
||
3 points for indicating 3 input nodes, 2 hidden neurons, and 1 output neuron.
|
||
-0.5 point for losing 1 node/neuron.
|
||
1 point for indicating 8 weights.
|
||
-0.5 point for losing 1 weight indicator, at most -1.
|
||
1 point for indicating 3 biases.
|
||
-0.5 point for losing 1 bias indicator, at most -1.
|
||
1 point for lines that connected layers.
|
||
-9,5 point for losing 1 connecting line, at most -1.
|
||
*** Additionally, suppose the number of nodes/neurons, the number of weights, the
|
||
number of biases, and the number of lines are more than expected. In that case, we will
|
||
only consider those correct ones, and the layer containing additional nodes/neurons will
|
||
be regarded as wrong. (e.g. if you put 3-¿4-¿1 as network, then only the input nodes
|
||
and output neuron consider to be correct, while all lines, weights, and biases should be
|
||
wrong, you can get 2 points.) ***
|
||
-------------------- END OF PAPER
|
||
--------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'easy', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [15 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1.5 points for each correct answer.
|
||
(a) Machine learning is a sub-field of artificial intelligence.
|
||
(b) Tensorflow is easy to use and less flexible than Keras.
|
||
(c) Suppose we wish to calculate P(B|e1, e2) and we have no conditional independence in-
|
||
formation. Having P(e1, e2), P(B), P(e1, e2|B) are sufficient for the calculation.
|
||
(d) Training a K-nearest neighbors classifier takes more computational time than applying
|
||
it.
|
||
(e) K-nearest neighbors cannot be used for regression.
|
||
(f) Consider D-fold cross-validation. A higher number of folds (i.e. larger value of D), the
|
||
estimated error will be lower on average.
|
||
(g) K-means clustering algorithm is guaranteed to converge.
|
||
(h) Consider a two-class classification problem. Suppose we have trained a perceptron model
|
||
on a linearly separable training set, and now we get a new labeled data point which is
|
||
correctly classified by the model, and far away from the decision boundary. If we add
|
||
this new point to our earlier training set and re-train with the same set of initial weights
|
||
and bias, the learnt decision boundary will be changed for sure.
|
||
(i) Gradient descent is usually NOT guaranteed to converge at global minimum.
|
||
(j) If the learning rate is too small, gradient descent may take a very long time to converge
|
||
and computationally expensive.
|
||
Question
|
||
Answer (T/F)
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)', 15, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [15 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1.5 points for each correct answer.
|
||
(a) Machine learning is a sub-field of artificial intelligence.
|
||
(b) Tensorflow is easy to use and less flexible than Keras.
|
||
(c) Suppose we wish to calculate P(B|e1, e2) and we have no conditional independence in-
|
||
formation. Having P(e1, e2), P(B), P(e1, e2|B) are sufficient for the calculation.
|
||
(d) Training a K-nearest neighbors classifier takes more computational time than applying
|
||
it.
|
||
(e) K-nearest neighbors cannot be used for regression.
|
||
(f) Consider D-fold cross-validation. A higher number of folds (i.e. larger value of D), the
|
||
estimated error will be lower on average.
|
||
(g) K-means clustering algorithm is guaranteed to converge.
|
||
(h) Consider a two-class classification problem. Suppose we have trained a perceptron model
|
||
on a linearly separable training set, and now we get a new labeled data point which is
|
||
correctly classified by the model, and far away from the decision boundary. If we add
|
||
this new point to our earlier training set and re-train with the same set of initial weights
|
||
and bias, the learnt decision boundary will be changed for sure.
|
||
(i) Gradient descent is usually NOT guaranteed to converge at global minimum.
|
||
(j) If the learning rate is too small, gradient descent may take a very long time to converge
|
||
and computationally expensive.
|
||
Question
|
||
Answer (T/F)
|
||
(a)
|
||
T
|
||
(b)
|
||
F
|
||
(c)
|
||
T
|
||
(d)
|
||
F
|
||
(e)
|
||
F
|
||
(f)
|
||
T
|
||
(g)
|
||
T
|
||
(h)
|
||
F
|
||
(i)
|
||
T
|
||
(j)
|
||
T
|
||
Marking scheme:
|
||
1.5 points for each correct answer.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [16 points] Python Fundamentals
|
||
For each of the Python expressions below, write the output when the expression is evaluated.
|
||
If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(a)
|
||
[5 points] Consider the following NumPy arrays:
|
||
import numpy as np
|
||
A = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
|
||
B = np.array([ [0, 1, 2, 3],
|
||
[4, 5, 6, 7],
|
||
[8, 9, 10, 11],
|
||
[12, 13, 14, 15] ])
|
||
(i) print(A[2:6:3])
|
||
(ii) print(B[0:2, 1:3])
|
||
(iii) print(A[-1:-3])
|
||
(iv) print(B[::-1])
|
||
(v) print(B[:3,2:])
|
||
(b)
|
||
[2 points] What is the output of the following code?
|
||
import numpy as np
|
||
X = np.array([2,2,0,2,2,0,1,1,0,1,1,0])
|
||
Y = X[X != 0]
|
||
print(Y[::2])
|
||
(c)
|
||
[9 points] Given the following code which computes the distance between each training
|
||
point in X train and each test point in X test using a nested loop over both the training
|
||
data and test data.
|
||
import numpy as np
|
||
def compute_distances_nested_loops(X_train, X_test):
|
||
num_test = X_test.shape[0]
|
||
num_train = X_train.shape[0]
|
||
distances = np.zeros((num_test, num_train))
|
||
# --- BLOCK TO REWRITE ---
|
||
for i in range(num_test):
|
||
for j in range(num_train):
|
||
distances[i,j] = np.sqrt(np.sum(np.square(X_test[i,:]-X_train[j,:])))
|
||
# --- BLOCK TO REWRITE ---
|
||
return distances
|
||
Train = np.array([[0,1], [1,2], [2,3], [3,4]])
|
||
Test = np.array([[5,6], [7,8]])
|
||
print(compute_distances_nested_loops(Train, Test))
|
||
# Output:
|
||
# [[7.07106781 5.65685425 4.24264069 2.82842712]
|
||
#
|
||
[9.89949494 8.48528137 7.07106781 5.65685425]]
|
||
Rewrite the block of code between the comment lines # --- BLOCK TO REWRITE ---
|
||
using no explicit loops in the space provided.
|
||
Hints:
|
||
To compute the distance between training data point (0, 1) and test data point (5, 6),
|
||
we do
|
||
dist = (0 −5)2 + (1 −6)2
|
||
Expand it
|
||
dist = 02 −2(0)(5) + 52 + 12 −2(1)(6) + 62
|
||
= 02 + 12 + 52 + 62 −2(0)(5) −2(1)(6)
|
||
= 02 + 12 + 52 + 62 −2((0)(5) + (1)(6))
|
||
You may find the following functions useful for this question.
|
||
Dot product of two arrays:
|
||
numpy.dot(a, b)
|
||
– It returns the product of matrix multiplication.
|
||
Return the element-wise square of the input
|
||
numpy.square(x)
|
||
– x is the input data
|
||
Return the sum of array elements over a given axis.
|
||
numpy.sum(a, axis=None)
|
||
– a is an array with elements to sum.
|
||
– axis: None or int or tuple of ints. axis = 0 means along the column and axis =
|
||
1 means working along ther row.
|
||
Return the non-negative square-root of an array, element-wise.
|
||
numpy.sqrt(x)
|
||
– x is the array with values whose square-roots are required.
|
||
Return a matrix (or a 2D array) from an 1D array.
|
||
numpy.matrix(data)
|
||
– data is the 1D array.
|
||
– Example: numpy.matrix([1, 2, 3]), output is [[1, 2, 3]]
|
||
Insert a new axis that will appear at the axis position in the expanded array shape.
|
||
numpy.expand_dims(a,axis)
|
||
– a is the input array.
|
||
– axis is an int or tuple of ints that represents poisition in the expanded axes where
|
||
the new axis (or axes) is placed.', 16, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [16 points] Python Fundamentals
|
||
For each of the Python expressions below, write the output when the expression is evaluated.
|
||
If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(a)
|
||
[5 points] Consider the following NumPy arrays:
|
||
import numpy as np
|
||
A = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
|
||
B = np.array([ [0, 1, 2, 3],
|
||
[4, 5, 6, 7],
|
||
[8, 9, 10, 11],
|
||
[12, 13, 14, 15] ])
|
||
(i) print(A[2:6:3])
|
||
Answer: [30 60]
|
||
# 1 point
|
||
(ii) print(B[0:2, 1:3])
|
||
Answer:
|
||
[[1 2]
|
||
[5 6]]
|
||
# 1 point
|
||
(iii) print(A[-1:-3])
|
||
Answer:
|
||
Empty Array
|
||
# 1 point
|
||
(iv) print(B[::-1])
|
||
Answer:
|
||
[[12 13 14 15]
|
||
# 1 point
|
||
[ 8
|
||
9 10 11]
|
||
[ 4
|
||
7]
|
||
[ 0
|
||
3]]
|
||
(v) print(B[:3,2:])
|
||
Answer:
|
||
[[ 2
|
||
3]
|
||
# 1 point
|
||
[ 6
|
||
7]
|
||
[10 11]]
|
||
Remark:
|
||
0.5 point will be deducted if the answers are in raw number format without [](i.e.
|
||
1 2 5 6). Because in this case, the answer is no longer in array type, and we can-
|
||
not distinguish whether the answer is a 2d array or 1d array. This deduction only
|
||
performs once for a(i) and a(ii).
|
||
(b)
|
||
[2 points] What is the output of the following code?
|
||
import numpy as np
|
||
X = np.array([2,2,0,2,2,0,1,1,0,1,1,0])
|
||
Y = X[X != 0]
|
||
print(Y[::2])
|
||
Answer:
|
||
[2 2 1 1]
|
||
# 2 points
|
||
Remark:
|
||
0.5 point will be deducted if the answers are in raw number format without [](i.e.
|
||
1 2 5 6). Because in this case, the answer is no longer in array type, and we can-
|
||
not distinguish whether the answer is a 2d array or 1d array. This deduction only
|
||
performs once for for a(i) and a(ii).
|
||
(c)
|
||
[9 points] Given the following code which computes the distance between each training
|
||
point in X train and each test point in X test using a nested loop over both the training
|
||
data and test data.
|
||
import numpy as np
|
||
def compute_distances_nested_loops(X_train, X_test):
|
||
num_test = X_test.shape[0]
|
||
num_train = X_train.shape[0]
|
||
distances = np.zeros((num_test, num_train))
|
||
# --- BLOCK TO REWRITE ---
|
||
for i in range(num_test):
|
||
for j in range(num_train):
|
||
distances[i,j] = np.sqrt(np.sum(np.square(X_test[i,:]-X_train[j,:])))
|
||
# --- BLOCK TO REWRITE ---
|
||
return distances
|
||
Train = np.array([[0,1], [1,2], [2,3], [3,4]])
|
||
Test = np.array([[5,6], [7,8]])
|
||
print(compute_distances_nested_loops(Train, Test))
|
||
# Output:
|
||
# [[7.07106781 5.65685425 4.24264069 2.82842712]
|
||
#
|
||
[9.89949494 8.48528137 7.07106781 5.65685425]]
|
||
Rewrite the block of code between the comment lines # --- BLOCK TO REWRITE ---
|
||
using no explicit loops in the space provided.
|
||
Hints:
|
||
To compute the distance between training data point (0, 1) and test data point (5, 6),
|
||
we do
|
||
dist = (0 −5)2 + (1 −6)2
|
||
Expand it
|
||
dist = 02 −2(0)(5) + 52 + 12 −2(1)(6) + 62
|
||
= 02 + 12 + 52 + 62 −2(0)(5) −2(1)(6)
|
||
= 02 + 12 + 52 + 62 −2((0)(5) + (1)(6))
|
||
You may find the following functions useful for this question.
|
||
Dot product of two arrays:
|
||
numpy.dot(a, b)
|
||
– It returns the product of matrix multiplication.
|
||
Return the element-wise square of the input
|
||
numpy.square(x)
|
||
– x is the input data
|
||
Return the sum of array elements over a given axis.
|
||
numpy.sum(a, axis=None)
|
||
– a is an array with elements to sum.
|
||
– axis: None or int or tuple of ints. axis = 0 means along the column and axis =
|
||
1 means working along ther row.
|
||
Return the non-negative square-root of an array, element-wise.
|
||
numpy.sqrt(x)
|
||
– x is the array with values whose square-roots are required.
|
||
Return a matrix (or a 2D array) from an 1D array.
|
||
numpy.matrix(data)
|
||
– data is the 1D array.
|
||
– Example: numpy.matrix([1, 2, 3]), output is [[1, 2, 3]]
|
||
Insert a new axis that will appear at the axis position in the expanded array shape.
|
||
numpy.expand_dims(a,axis)
|
||
– a is the input array.
|
||
– axis is an int or tuple of ints that represents poisition in the expanded axes where
|
||
the new axis (or axes) is placed.
|
||
Answer:
|
||
M = np.dot(X_test, X_train.T)
|
||
te = np.square(X_test).sum(axis=1)
|
||
tr = np.square(X_train).sum(axis=1)
|
||
dists = np.sqrt(-2*M + np.matrix(tr) + np.matrix(te).T)
|
||
Alternative answer:
|
||
distances = np.sqrt(np.sum(np.square(np.expand_dims(X_test, 1) - X_train), axis=2))
|
||
distances = np.sqrt(np.sum(np.square(X_test[:,None] - X_train[None]), axis=-1))
|
||
Marking scheme:
|
||
Case 0: Any for loop, list comprehension. # 0 point
|
||
Case 1:
|
||
(a) M = np.dot(X_test, X_train.T) # 2 points
|
||
1 point if missing transpose
|
||
(b) te = np.square(X_test).sum(axis=1) # 2 points
|
||
tr = np.square(X_train).sum(axis=1) # 2 points
|
||
– each square # 1 point
|
||
– each sum with right axis # 1 point
|
||
(c) dists = np.sqrt(-2*M + np.matrix(tr) + np.matrix(te).T) # 3 points
|
||
– 1 point if shape doesn’t match.
|
||
– 2 points if np.matrix is missing, or attempt to modify the shape but wrong
|
||
– if np.sqrt is missing,
|
||
* if previous part isn’t calculated correctly, 0 point for this part.
|
||
* -1 point otherwise.
|
||
Case 2:
|
||
distances = np.sqrt(np.sum(np.square(np.expand_dims(X_test, 1) - X_train), axis=2))
|
||
# 4 points
|
||
(a) 4 points for the subtraction
|
||
– 2 points if the shape modification is wrong.
|
||
– 0 points if no shape modification at all.
|
||
(b) 2 points for square
|
||
(c) 2 points for sum;
|
||
1 point if axis is missing or wrong
|
||
(d) 1 point for sqrt (unlike Case 1, sqrt is worth 1 point here.)
|
||
(e) -3 points if a2–b2 is used.
|
||
Remarks:
|
||
(a) If reshaped is used to change the shape of the solution, -1 point.
|
||
(b) If the program assume there’s 4 train points or 2 test points, and the number of
|
||
dimension is 2, -1 point or 0 point for the whole question.
|
||
(c) If Train, Test used instead of Xtrain, Xtest, -1 point. If your program are terribly
|
||
wrong (< 3 points) anyway, this mark wouldn’t be deducted.
|
||
(d) If extra code which leads to wrong answer, -1 point.
|
||
(e) If the output shape is (numtrain, numtest) instead of (numtest, numtrain), -1 point.
|
||
(f) If expand dims/reshape used without reassigning the variable, i.e. a = a.reshape(*b.shape),
|
||
-1 point.
|
||
(g) If missing shape, (e.g. Xtrain[0] instead of Xtrain.shape[0]), – 0.5 point.
|
||
(h) Wrong spelling and syntax will not resulted in mark deduction, but 0.5 point will be
|
||
deducted if you got full points.
|
||
(i) Index using i j assuming the existence of for loop gets 0 point.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [11 points] Conditional Probability and Bayes Classifier
|
||
(a)
|
||
[2 points] Given the following probabilities:
|
||
P(Good course | Desmond is in the course) = 0.5
|
||
P(Good course | Desmond is not in the course) = 0.3
|
||
P(Desmond in a randomly chosen course) = 0.1
|
||
What is P(Desmond is in the course | Not a good course)? If your answer is not an
|
||
integer, write your answer in fraction form (use / to separate your numerator and de-
|
||
nominator), e.g. 1/2.
|
||
(b)
|
||
[7 points] Suppose you are given the following set of data with three Boolean input
|
||
variables A, B, C, and a single Boolean output variable D.
|
||
A
|
||
B
|
||
C
|
||
D
|
||
(i) According to the Naive Bayes classifier, what is P(D = 1|A = 1, B = 1, C = 0)? If
|
||
your answer is not an integer, write your answer in fraction form (use / to separate
|
||
your numerator and denominator), e.g. 1/2.
|
||
(ii) According to the Naive Bayes classifier, what is P(D = 0|A = 1, B = 1)? If your
|
||
answer is not an integer, write your answer in fraction form (use / to separate your
|
||
numerator and denominator), e.g. 1/2.
|
||
(iii) According to the general Bayes classifier (i.e. without independence assumption),
|
||
what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your
|
||
answer in fraction form (use / to separate your numerator and denominator), e.g.
|
||
1/2.
|
||
(iv) According to the general Bayes classifier (i.e. without independence assumption),
|
||
what is P(D = 1|A = 1, B = 1)? If your answer is not an integer, write your answer
|
||
in fraction form (use / to separate your numerator and denominator), e.g. 1/2.
|
||
(c)
|
||
[2 points] The Naive Bayes algorithm selects the class c for an example x that maxi-
|
||
mizes P(c|x). Suppose one of your classmates stated that it is equivalent to selecting the
|
||
c that maximizes P(x|c) under an assumption. What is the assumption that he has made?', 11, 7, NULL::jsonb, NULL, NULL, 'Problem 3 [11 points] Conditional Probability and Bayes Classifier
|
||
(a)
|
||
[2 points] Given the following probabilities:
|
||
P(Good course | Desmond is in the course) = 0.5
|
||
P(Good course | Desmond is not in the course) = 0.3
|
||
P(Desmond in a randomly chosen course) = 0.1
|
||
What is P(Desmond is in the course | Not a good course)? If your answer is not an
|
||
integer, write your answer in fraction form (use / to separate your numerator and de-
|
||
nominator), e.g. 1/2.
|
||
Answer:
|
||
Let D be Desmond is in the course, G be a good course
|
||
P(D|NOT G) = P(D, NOT G)
|
||
P(Not G)
|
||
=
|
||
P(Not G|D)P(D)
|
||
P(Not G|D)P(D) + P(Not G|Not D)P(Not D)
|
||
=
|
||
(1 −0.5) × 0.1
|
||
(1 −0.5) × 0.1 + (1 −0.3) × (1 −0.1)
|
||
= 5
|
||
Marking scheme:
|
||
2 points for giving the correct answer.
|
||
(b)
|
||
[7 points] Suppose you are given the following set of data with three Boolean input
|
||
variables A, B, C, and a single Boolean output variable D.
|
||
A
|
||
B
|
||
C
|
||
D
|
||
(i) According to the Naive Bayes classifier, what is P(D = 1|A = 1, B = 1, C = 0)? If
|
||
your answer is not an integer, write your answer in fraction form (use / to separate
|
||
your numerator and denominator), e.g. 1/2.
|
||
Answer:
|
||
P(D = 1|A = 1, B = 1, C = 0) =
|
||
P(D = 1)P(A = 1|D = 1)P(B = 1|D = 1)P(C = 0|D = 1)
|
||
P1
|
||
j=0 P(D = j)P(A = 1|D = j)P(B = 1|D = j)P(C = 0|D = j)
|
||
=
|
||
(4/8)(2/4)(1/4)(2/4)
|
||
(4/8)(2/4)(2/4)(1/4) + (4/8)(2/4)(1/4)(2/4) = 1
|
||
Marking scheme:
|
||
1.75 points for giving the correct answer.
|
||
(ii) According to the Naive Bayes classifier, what is P(D = 0|A = 1, B = 1)? If your
|
||
answer is not an integer, write your answer in fraction form (use / to separate your
|
||
numerator and denominator), e.g. 1/2.
|
||
Answer:
|
||
P(D = 0|A = 1, B = 1) =
|
||
P(D = 0)P(A = 1|D = 0)P(B = 1|D = 0)
|
||
P1
|
||
j=0 P(D = j)P(A = 1|D = j)P(B = 1|D = j)
|
||
=
|
||
(4/8)(2/4)(2/4)
|
||
(4/8)(2/4)(2/4) + (4/8)(2/4)(1/4) = 2
|
||
Marking scheme:
|
||
1.75 points for giving the correct answer.
|
||
(iii) According to the general Bayes classifier (i.e. without independence assumption),
|
||
what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your
|
||
answer in fraction form (use / to separate your numerator and denominator), e.g.
|
||
1/2.
|
||
Answer:
|
||
P(D = 1|A = 1, B = 1, C = 0) = 0 as there is no data with A = 1, B = 1, C
|
||
= 0 and D = 1 in the dataset.
|
||
Marking scheme:
|
||
1.75 points for giving the correct answer.
|
||
(iv) According to the general Bayes classifier (i.e. without independence assumption),
|
||
what is P(D = 1|A = 1, B = 1)? If your answer is not an integer, write your answer
|
||
in fraction form (use / to separate your numerator and denominator), e.g. 1/2.
|
||
Answer:
|
||
P(D = 1|A = 1, B = 1) = 1/2 as number of data with A = 1, B = 1, D = 1
|
||
is 1, and number of data with A = 1, B = 1 is 2.
|
||
Marking scheme:
|
||
1.75 points for giving the correct answer.
|
||
(c)
|
||
[2 points] The Naive Bayes algorithm selects the class c for an example x that maxi-
|
||
mizes P(c|x). Suppose one of your classmates stated that it is equivalent to selecting the
|
||
c that maximizes P(x|c) under an assumption. What is the assumption that he has made?
|
||
Answer:
|
||
P(c|x) = P(x|c)P(c)
|
||
P(x)
|
||
, so finding the c that maximizes P(c|x) is equivalent to finding c
|
||
that maximizes P(x|c), if the prior P(c) is uniform.
|
||
Marking scheme:
|
||
2 points for stating the assumption correctly.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors
|
||
(a) [3 points] Consider a set of 5 training data given as ((xTrain
|
||
, xTrain
|
||
), cTrain) values, where
|
||
xTrain
|
||
and xTrain
|
||
are the two attribute values (positive integers) and cTrain is the binary
|
||
class label, A or B:
|
||
{ ((6,7),A), ((4,8),B), ((6,5),B), ((7,9),A), ((2,4),A) }
|
||
Classify a test example (xTest
|
||
, xTest
|
||
) with attribute values (4,7) using a KNN classifier
|
||
with K = 3 and Manhattan distance defined by
|
||
distance(xTrain, xTest) = P2
|
||
i=1 |xTrain
|
||
i
|
||
−xTest
|
||
i
|
||
|
|
||
where | · | denote absolute value.
|
||
Complete the following table by filling in the computed Manhattan distance between
|
||
each training data point and the test example. Determine the class label based on the
|
||
results.
|
||
xTrain
|
||
xTrain
|
||
cTrain
|
||
Distance
|
||
A
|
||
B
|
||
B
|
||
A
|
||
A
|
||
(b) [6 points] Judge whether each of the following student’s claims is correct or not. Explain
|
||
why.
|
||
(i)
|
||
[3 points] A student claims that the results of a general KNN classifier that uses
|
||
Euclidean distance will change if we multiply all attribute values of each training
|
||
and test data point by 0.5.
|
||
(ii) [3 points] Another student claims that the classification accuracy of the training set
|
||
will always increase if the value of K used in KNN classifier is incrementally increased
|
||
from 1 to N, where N is the total number of training examples.
|
||
(c) [5 points] Consider KNN using Euclidean distance on the following data set. Each point
|
||
belongs to one of the two classes: + and x.
|
||
(i)
|
||
[2 points] Perform 10-fold cross validation on the given data set (i.e. the 10 data
|
||
points as shown in the figure), what is the validation error when using 1-nearest
|
||
neighbor?
|
||
(ii) [3 points] Which of the following values of K leads to the minimum 10-fold cross vali-
|
||
dation error: 3, 5 or 9? What is the error for that K? If there is a tie, please elaborate.', 14, 9, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors
|
||
(a) [3 points] Consider a set of 5 training data given as ((xTrain
|
||
, xTrain
|
||
), cTrain) values, where
|
||
xTrain
|
||
and xTrain
|
||
are the two attribute values (positive integers) and cTrain is the binary
|
||
class label, A or B:
|
||
{ ((6,7),A), ((4,8),B), ((6,5),B), ((7,9),A), ((2,4),A) }
|
||
Classify a test example (xTest
|
||
, xTest
|
||
) with attribute values (4,7) using a KNN classifier
|
||
with K = 3 and Manhattan distance defined by
|
||
distance(xTrain, xTest) = P2
|
||
i=1 |xTrain
|
||
i
|
||
−xTest
|
||
i
|
||
|
|
||
where | · | denote absolute value.
|
||
Complete the following table by filling in the computed Manhattan distance between
|
||
each training data point and the test example. Determine the class label based on the
|
||
results.
|
||
xTrain
|
||
xTrain
|
||
cTrain
|
||
Distance
|
||
A
|
||
B
|
||
B
|
||
A
|
||
A
|
||
Answer:
|
||
xTrain
|
||
xTrain
|
||
cTrain
|
||
Distance
|
||
A
|
||
B
|
||
B
|
||
A
|
||
A
|
||
The predicted class label of the test example is B.
|
||
Marking scheme:
|
||
0.5 point for giving each correct distance value. 2.5 points in total.
|
||
0.5 point for giving the correct class label.
|
||
(b) [6 points] Judge whether each of the following student’s claims is correct or not. Explain
|
||
why.
|
||
(i)
|
||
[3 points] A student claims that the results of a general KNN classifier that uses
|
||
Euclidean distance will change if we multiply all attribute values of each training
|
||
and test data point by 0.5.
|
||
Answer:
|
||
The claim is false, because K nearest neighbors will remain unchanged after multi-
|
||
plying all attribute values of each training and test data points by 0.5.
|
||
Marking scheme:
|
||
1 point for stating the claim is false.
|
||
2 points for giving the correct explanation.
|
||
Remark:
|
||
1 point given if stating the claim is correct but did mention that the change on
|
||
distance will be a multiplication of constant to the old distance AND didn’t state
|
||
result of KNN classifier will change.
|
||
2 points given if stating the claim is correct but mention the change on distance
|
||
will be a multiplication and state the result wont change.
|
||
0 point if correct deduction but draw the conclusion that result will change.
|
||
1 point for explanation if treat the question as asking multiply the values on
|
||
Manhattan and answered correctly.
|
||
0 point for explanation if treat the question as asking comparison between Man-
|
||
hattan and euclidean distance.
|
||
(ii) [3 points] Another student claims that the classification accuracy of the training set
|
||
will always increase if the value of K used in KNN classifier is incrementally increased
|
||
from 1 to N, where N is the total number of training examples.
|
||
Answer:
|
||
The claim is false. A counterexample is as follows:
|
||
The training set accuracy when K = 1 will be 100%.
|
||
As K approaches the total number of training examples more and more examples
|
||
influence the class, and eventually the class will always be the majority class in the
|
||
training set.
|
||
Marking scheme:
|
||
1 point for stating the claim is false.
|
||
2 points for giving the correct explanation.
|
||
Remarks:
|
||
Give full mark if they misregard outliers as further neighbors.
|
||
-1 point if they mention is due to outliers but didn’t explain what is regarded as
|
||
outliers.
|
||
-2 points if they really mean outliers.
|
||
-1 point if they only state larger k will cover more neighbors (this is just explain-
|
||
ing what does a larger k means but not explaining why larger k can lower the
|
||
accuracy) (*further neighbors is accepted but more neighbors is not)
|
||
Other accept answers:
|
||
i. Larger k include further neighbors which have low relevancy.
|
||
ii. Underfitting
|
||
iii. Giving a concrete situation that the claim is wrong.
|
||
(c) [5 points] Consider KNN using Euclidean distance on the following data set. Each point
|
||
belongs to one of the two classes: + and x.
|
||
(i)
|
||
[2 points] Perform 10-fold cross validation on the given data set (i.e. the 10 data
|
||
points as shown in the figure), what is the validation error when using 1-nearest
|
||
neighbor?
|
||
Answer:
|
||
Every point is misclassified. So, the validation error is 10/10.
|
||
Marking scheme:
|
||
2 points for giving the correct validation error.
|
||
Note: Both 10/10 and 100% are accepted as the correct answers.
|
||
(ii) [3 points] Which of the following values of K leads to the minimum 10-fold cross vali-
|
||
dation error: 3, 5 or 9? What is the error for that K? If there is a tie, please elaborate.
|
||
Answer:
|
||
All 3 values of K mis-classify all of the points and have the same classification errors,
|
||
10/10.
|
||
Marking scheme:
|
||
2 points for stating all 3 values of K have the same classification errors.
|
||
1 point for giving the correct error.
|
||
Note: Both 10/10 and 100% are accepted as the correct answers.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [12 points] K-Means Clustering
|
||
Given a 1-dimensional data set {0, 3, 6, 9, 27, 30}, use the K-means clustering algorithm and
|
||
Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3
|
||
and c2 = 4 are chosen as the initial cluster centers.
|
||
(a) [4.5 points] Perform one iteration of K-means clustering by finding the Euclidean distance
|
||
between each data point in the data set and the centroids, and assign each data point to
|
||
the closest centroid according to the distance found. Fill in the following table with your
|
||
computed values. If your distance value is not an integer, write your answer in decimal
|
||
form, e.g. 1.234.
|
||
Data Point
|
||
Distance between the
|
||
data point and c1 = 3
|
||
Distance between the
|
||
data point and c2 = 4
|
||
Closest Centroid
|
||
(c1 or c2?)
|
||
(b)
|
||
[1.5 points] What are the values of c1 and c2 after one iteration of K-means? If your
|
||
answer is not an integer, write your answer in decimal form, e.g. 1.234.
|
||
(c) [4.5 points] Perform the second iteration of K-means clustering by finding the Euclidean
|
||
distance between each data point in the data set and the computed centroids in part (b),
|
||
and assign each data point to the closest centroid according to the distance found. Fill in
|
||
the following table with your computed values. If your distance value is not an integer,
|
||
write your answer in decimal form, e.g. 1.234.
|
||
Data Point
|
||
Distance between the
|
||
data point and the c1
|
||
computed in part (b)
|
||
Distance between the
|
||
data point and the c2
|
||
computed in part (b)
|
||
Closest Centroid
|
||
(c1 or c2?)
|
||
(d)
|
||
[1.5 points] What are the values of c1 and c2 after the second iteration of K-means? If
|
||
your answer is not an integer, write your answer in decimal form, e.g. 1.234.', 12, 11, NULL::jsonb, NULL, NULL, 'Problem 5 [12 points] K-Means Clustering
|
||
Given a 1-dimensional data set {0, 3, 6, 9, 27, 30}, use the K-means clustering algorithm and
|
||
Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3
|
||
and c2 = 4 are chosen as the initial cluster centers.
|
||
(a) [4.5 points] Perform one iteration of K-means clustering by finding the Euclidean distance
|
||
between each data point in the data set and the centroids, and assign each data point to
|
||
the closest centroid according to the distance found. Fill in the following table with your
|
||
computed values. If your answer is not an integer, write your answer in decimal form,
|
||
e.g. 1.234.
|
||
Data Point
|
||
Distance between the
|
||
data point and c1 = 3
|
||
Distance between the
|
||
data point and c2 = 4
|
||
Closest Centroid
|
||
(c1 or c2?)
|
||
Answer:
|
||
Data Point
|
||
Distance between the
|
||
data point and c1 = 3
|
||
Distance between the
|
||
data point and c2 = 4
|
||
Closest Centroid
|
||
(c1 or c2?)
|
||
c1
|
||
c1
|
||
c2
|
||
c2
|
||
c2
|
||
c2
|
||
Marking scheme:
|
||
0.25 for giving each correct value. 4.5 points in total.
|
||
(b)
|
||
[1.5 points] What are the values of c1 and c2 after one iteration of K-means? If your
|
||
answer is not an integer, write your answer in decimal form, e.g. 1.234.
|
||
Answer:
|
||
c1 = 0 + 3
|
||
= 1.5
|
||
c2 = 6 + 9 + 27 + 30
|
||
= 18
|
||
Marking scheme:
|
||
0.75 for giving each correct centroid. 1.5 points in total.
|
||
(c) [4.5 points] Perform the second iteration of K-means clustering by finding the Euclidean
|
||
distance between each data point in the data set and the computed centroids in part (b),
|
||
and assign each data point to the closest centroid according to the distance found. Fill
|
||
in the following table with your computed values. If your answer is not an integer, write
|
||
your answer in decimal form, e.g. 1.234.
|
||
Data Point
|
||
Distance between the
|
||
data point and the c1
|
||
computed in part (b)
|
||
Distance between the
|
||
data point and the c2
|
||
computed in part (b)
|
||
Closest Centroid
|
||
(c1 or c2?)
|
||
Answer:
|
||
Data Point
|
||
Distance between the
|
||
data point and the c1
|
||
computed in part (b)
|
||
Distance between the
|
||
data point and the c2
|
||
computed in part (b)
|
||
Closest Centroid
|
||
(c1 or c2?)
|
||
1.5
|
||
c1
|
||
1.5
|
||
c1
|
||
4.5
|
||
c1
|
||
7.5
|
||
c1
|
||
25.5
|
||
c2
|
||
28.5
|
||
c2
|
||
Marking scheme:
|
||
0.25 for giving each correct value. 4.5 points in total.
|
||
(d)
|
||
[1.5 points] What are the values of c1 and c2 after the second iteration of K-means? If
|
||
your answer is not an integer, write your answer in decimal form, e.g. 1.234.
|
||
Answer:
|
||
c1 = 0 + 3 + 6 + 9
|
||
= 4.5
|
||
c2 = 27 + 30
|
||
= 28.5
|
||
Marking scheme:
|
||
0.75 for giving each correct centroid. 1.5 points in total.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [20 points] Perceptron
|
||
Given the following training dataset:
|
||
x1
|
||
0.5
|
||
x2
|
||
0.5
|
||
T
|
||
-1
|
||
-1
|
||
-1
|
||
(a)
|
||
[8 points] Show the action of the perceptron algorithm for the above sequence of data
|
||
points by completing the following table. Assume η = 1 and we start with the following
|
||
initial weights and bias
|
||
w1 = 1
|
||
w2 = 1
|
||
θ = 0
|
||
and use the following activation function.
|
||
f(z) =
|
||
|
||
|
||
|
||
z ≥0
|
||
−1
|
||
otherwise
|
||
Updating rules:
|
||
∆wi = η(T −O)xi
|
||
∆θ = η(T −O)
|
||
wi = wi + ∆wi
|
||
θ = θ + ∆θ
|
||
where i ∈{1, 2}.
|
||
If your answer is not an integer, write your answer in decimal form, e.g. 1.234.
|
||
x1
|
||
x2
|
||
T
|
||
O
|
||
∆w1
|
||
w1
|
||
∆w2
|
||
w2
|
||
∆θ
|
||
θ
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
(b)
|
||
[2 points] According to the values in the table above, state whether the perceptron
|
||
algorithm is converged in 1 epoch. If not, explain why.
|
||
(c) [10 points] Write a Python program to verify your answers of Part (a). In your program,
|
||
you need to define the following variables.
|
||
A 2D NumPy array, X, to store all the attribute values x1, x2, where the shape is
|
||
(8,2)
|
||
A 1D NumPy array, T,to store the target values, where the shape is (8,)
|
||
A 1D NumPy array, W, to store the weights, where the shape is (2,)
|
||
A float bias value, b.
|
||
and perform the required computations. Also, print the following sequence of values (in
|
||
exact order) for each iteration:
|
||
x1 < s > x2 < s > T < s > O < s > ∆w1 < s > w1 < s > ∆w2 < s > w2 < s > ∆θ < s > θ < end >
|
||
where < s > refers to an empty space, and < end > refers to an end of line character.
|
||
The following shows a line of sample output.
|
||
0 0 0 0 0 0 0 0 0 0
|
||
Remark: You cannot use any other libraries other than NumPy in your program.', 20, 12, NULL::jsonb, NULL, NULL, 'Problem 6 [20 points] Perceptron
|
||
Given the following training dataset:
|
||
x1
|
||
0.5
|
||
x2
|
||
0.5
|
||
T
|
||
-1
|
||
-1
|
||
-1
|
||
(a)
|
||
[8 points] Show the action of the perceptron algorithm for the above sequence of data
|
||
points by completing the following table. Assume η = 1 and we start with the following
|
||
initial weights and bias
|
||
w1 = 1
|
||
w2 = 1
|
||
θ = 0
|
||
and use the following activation function.
|
||
f(z) =
|
||
|
||
|
||
|
||
z ≥0
|
||
−1
|
||
otherwise
|
||
Updating rules:
|
||
∆wi = η(T −O)xi
|
||
∆θ = η(T −O)
|
||
wi = wi + ∆wi
|
||
θ = θ + ∆θ
|
||
where i ∈{1, 2}.
|
||
If your answer is not an integer, write your answer in decimal form, e.g. 1.234.
|
||
x1
|
||
x2
|
||
T
|
||
O
|
||
∆w1
|
||
w1
|
||
∆w2
|
||
w2
|
||
∆θ
|
||
θ
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
Answer:
|
||
x1
|
||
x2
|
||
T
|
||
O
|
||
∆w1
|
||
w1
|
||
∆w2
|
||
w2
|
||
∆θ
|
||
θ
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-
|
||
-1
|
||
-2
|
||
-2
|
||
-2
|
||
-1
|
||
-6
|
||
-5
|
||
-6
|
||
-5
|
||
-2
|
||
-4
|
||
-1
|
||
-2
|
||
0.5
|
||
0.5
|
||
-1
|
||
-1
|
||
-1
|
||
-2
|
||
-4
|
||
-4
|
||
-4
|
||
Marking scheme:
|
||
0.1 point for giving each correct value. 8 points in total.
|
||
(b)
|
||
[2 points] According to the values in the table above, state whether the perceptron al-
|
||
gorithm is converged in 1 epoch. If not, explain why.
|
||
Answer:
|
||
The algorithm is not converged in 1 epoch.
|
||
Since there are changes of weights and
|
||
biases.
|
||
Marking scheme:
|
||
1 point for stating the algorithm is not converged.
|
||
1 point for giving a correct explanation.
|
||
(c) [10 points] Write a Python program to verify your answers of Part (a). In your program,
|
||
you need to define the following variables
|
||
A 2D NumPy array, X, to store all the attribute values x1, x2, where the shape is
|
||
(8,2)
|
||
A 1D NumPy array, T,to store the target values, where the shape is (8,)
|
||
A 1D NumPy array, W, to store the weights, where the shape is (2,)
|
||
A float bias value, b.
|
||
and perform the required computations. Also, print the following sequence of values (in
|
||
exact order) for each iteration:
|
||
x1 < s > x2 < s > T < s > O < s > ∆w1 < s > w1 < s > ∆w2 < s > w2 < s > ∆θ < s > θ < end >
|
||
where < s > refers to an empty space, and < end > refers to an end of line character.
|
||
The following shows a line of sample output.
|
||
0 0 0 0 0 0 0 0 0 0
|
||
Remark: You cannot use any other libraries other than NumPy in your program.
|
||
Answer:
|
||
import numpy as np
|
||
# 0.5 point
|
||
X = np.array([[10,10], [0,0], [8,4], [3,3], [4,8], [0.5,0.5], [4,3], [2,5]])
|
||
# 1 point
|
||
T = np.array([1,-1,1,-1,1,-1,1,1])
|
||
# 0.5 point
|
||
W = np.array([1,1], dtype=float)
|
||
# 1 point
|
||
b = 0.0
|
||
# 0.5 point
|
||
for i in range(X.shape[0]):
|
||
# 1 point
|
||
y = X[i].dot(W) + b
|
||
# 1 point
|
||
if y >= 0:
|
||
# 0.5 point
|
||
O = 1
|
||
# 0.5 point
|
||
else:
|
||
O = -1
|
||
# 0.5 point
|
||
DW = (T[i] - O) * X[i]
|
||
# 0.5 point
|
||
W += DW
|
||
# 0.5 point
|
||
Db = (T[i] - O)
|
||
# 0.5 point
|
||
b += Db
|
||
# 0.5 point
|
||
print(X[i][0], X[i][1], T[i], O, DW[0], W[0], DW[1], W[1], Db, b)
|
||
# 1 point
|
||
Remarks:
|
||
-0.25 point for forgetting to specify W = np.array([1,1], dtype=float)
|
||
Can also b = 0, Python will duck-type into float when needed.
|
||
-0.25 point each for not using NumPy array. No overlap with W float penalty, since
|
||
Python list supports duck-typing.
|
||
-0.25 point each for wrong array shape.
|
||
-0.5 point for hard-coding for i in range(8):.
|
||
-0.25 point for forgetting bias in the output calculation.
|
||
-0.25 point for minor print formatting errors.
|
||
-0.25 point for incorrect print when output == target.
|
||
-0.25 point each for miscellaneous syntax errors.
|
||
-1 flat point for defining as class/function(s) but not calling.
|
||
-10 floor point for using SKlearn.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2022-spring-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [12 points] Perceptron and Multilayer Perceptron
|
||
(a)
|
||
[4 points] Can we represent the given boolean function with a single neuron as shown
|
||
below?
|
||
A
|
||
B
|
||
f(A,B)
|
||
If yes, show the possible weights, bias and activation function that can be used to com-
|
||
pute the output of all the data points correctly. If not, explain why not in 1-2 sentences.
|
||
(b) [6 points] Suppose we have a neural network as shown below with θ1 = 0, θ2 = 0, θ3 = 0,
|
||
and linear activation function (i.e. f(x) = Cx, where C is a constant).
|
||
Can any function that is represented by the above network be represented by a single
|
||
unit of artificial neural network in the following diagram? If so, detail the weights (w7
|
||
and w8), bias (θ), and the activation function f(x). Otherwise, explain why not.
|
||
(c)
|
||
[2 points] Given a multilayer perceptron with 3 layers (input layer, hidden layer and
|
||
output layer). The number of units in each of the these layers are 3, 4, 2. Assume each
|
||
input or neuron is fully-connected. Calculate the number of trainable parameters of this
|
||
network.
|
||
-------------------- END OF PAPER
|
||
--------------------', 12, 14, NULL::jsonb, NULL, NULL, 'Problem 7 [12 points] Perceptron and Multilayer Perceptron
|
||
(a)
|
||
[4 points] Can we represent the given boolean function with a single neuron as shown
|
||
below?
|
||
A
|
||
B
|
||
f(A,B)
|
||
If yes, show the possible weights, bias and activation function that can be used to com-
|
||
pute the output of all the data points correctly. If not, explain why not in 1-2 sentences.
|
||
Answer:
|
||
Yes, we can represent this function with a single neuron, since it is linearly separable.
|
||
One set of possible weights and bias is: w1 = 1, w2 = −1, θ = −0.5, and the activation
|
||
function is
|
||
f(x) =
|
||
|
||
|
||
|
||
x > 0
|
||
otherwise
|
||
Marking scheme:
|
||
1 point for stating it is possible to represent the given boolean function with a single
|
||
neuron.
|
||
1 point for showing the “standard” activation function.
|
||
2 points for correct values of w1, w2, and θ, given the activation function is the
|
||
“standard” one.
|
||
If the given activation function is not “standard”, but the parameters are suitable,
|
||
also get 2 points.
|
||
0 point for stating not possible to represent.
|
||
(b) [6 points] Suppose we have a neural network as shown below with θ1 = 0, θ2 = 0, θ3 = 0,
|
||
and linear activation function (i.e. f(x) = Cx, where C is a constant).
|
||
Can any function that is represented by the above network be represented by a single
|
||
unit of artificial neural network in the following diagram? If so, detail the weights (w7
|
||
and w8), bias (θ), and the activation function f(x). Otherwise, explain why not.
|
||
Answer:
|
||
Yes, the network can be represented by a single unit of artificial neural network by
|
||
setting its weights, w7 = w1w5 + w3w6, w8 = w2w5 + w4w6, and the activation function
|
||
f(x) = C2.
|
||
Marking scheme:
|
||
0.5 point for stating any function that is represented by the MLP can be represented
|
||
by a single unit of artificial neural network.
|
||
1.5 points for showing each possible weight w7, w8. 3 points in total.
|
||
1 point for showing a possible bias value, i.e. 0.
|
||
1.5 for showing the activation function.
|
||
Remarks:
|
||
If stating NO, -6 points.
|
||
But among those NO’s, if students put the expression
|
||
OUTPUT=C2w5(x1w1 + x2w2 + θ1) + C2w6(x1w3 + x2w4 + θ2) + Cθ3, give 1 point.
|
||
For those stating YES, if missing some value, deduct the score of that value.
|
||
If the order of C is wrong, e.g. one C in w7w8 and no C in activation, or C2 in w7w8
|
||
and one C in activation, -1 point.
|
||
For other wrong values, -full mark for that values.
|
||
If didn’t plug in given values and the expression is wrong, even if the expression may
|
||
evaluate to the same value as the answer, -full mark for that expression or -1 point
|
||
if it’s an order-of-C issue.
|
||
If didn’t plug in given values, e.g. θs=0, the same C for all units (some use C1C2C3),
|
||
but otherwise correct, -0.5 for each of θ & C.
|
||
(c)
|
||
[2 points] Given a multilayer perceptron with 3 layers (input layer, hidden layer and
|
||
output layer). The number of units in each of the these layers are 3, 4, 2. Assume each
|
||
input or neuron is fully-connected. Calculate the number of trainable parameters of this
|
||
network.
|
||
Answer:
|
||
3 ∗4 + 4 + 4 ∗2 + 2 = 26.
|
||
Marking scheme:
|
||
2 points for giving the correct answer.
|
||
-------------------- END OF PAPER
|
||
--------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-a', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1 point for each correct answer.
|
||
(a) One drawback of the K-means algorithm is that one needs to specify exactly how many
|
||
clusters the algorithm should find.
|
||
(b) Increasing the number of hidden layers always increases the model performance.
|
||
(c) When handling a binary classification task, both softmax and sigmoid functions can be
|
||
used as the activation function in the output layer.
|
||
(d) Validation accuracy must be lower than training accuracy.
|
||
(e) We cannot train/inference deep learning networks using CPU.
|
||
(f) Otsu’s thresholding method and affine transformations are point-based image operations.
|
||
(g) In the convolutional layer of a CNN, the number of weights depends on the depth of the
|
||
input volume and the number of biases is equal to the number of kernels.
|
||
(h) After training a neural network, you observe a large gap between the training accuracy
|
||
(100%) and the task accuracy (40%). Dropout is commonly used to reduce this gap.
|
||
(i) In a minimax-based 3×3 tic-tac-toe game, an AI player will definitely win because it
|
||
knows all possible moves of the game.
|
||
(j) The alpha-beta pruning algorithm is preferred to minimax because it computes the same
|
||
answer as minimax while usually doing so without examining as much of the game tree.
|
||
Question
|
||
Answer (T/F)
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)', 10, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1 point for each correct answer.
|
||
(a) One drawback of the K-means algorithm is that one needs to specify exactly how many
|
||
clusters the algorithm should find.
|
||
(b) Increasing the number of hidden layers always increases the model performance.
|
||
(c) When handling a binary classification task, both softmax and sigmoid functions can be
|
||
used as the activation function in the output layer.
|
||
(d) Validation accuracy must be lower than training accuracy.
|
||
(e) We cannot train/inference deep learning networks using CPU.
|
||
(f) Otsu’s thresholding method and affine transformations are point-based image operations.
|
||
(g) In the convolutional layer of a CNN, the number of weights depends on the depth of the
|
||
input volume and the number of biases is equal to the number of kernels.
|
||
(h) After training a neural network, you observe a large gap between the training accuracy
|
||
(100%) and the task accuracy (40%). Dropout is commonly used to reduce this gap.
|
||
(i) In a minimax-based 3×3 tic-tac-toe game, an AI player will definitely win because it
|
||
knows all possible moves of the game.
|
||
(j) The alpha-beta pruning algorithm is preferred to minimax because it computes the same
|
||
answer as minimax while usually doing so without examining as much of the game tree.
|
||
Answer:
|
||
Question
|
||
Answer (T/F)
|
||
(a)
|
||
T
|
||
(b)
|
||
F
|
||
(c)
|
||
T
|
||
(d)
|
||
F
|
||
(e)
|
||
F
|
||
(f)
|
||
F
|
||
(g)
|
||
T
|
||
(h)
|
||
T
|
||
(i)
|
||
F
|
||
(j)
|
||
T
|
||
Marking scheme:
|
||
1 point for giving each correct answer. 10 points in total.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-a', '2', NULL, 2, 'long_question', 'long_answer', 'Problem 2 [14 points] Na¨ıve Bayes and K-Nearest Neighbors
|
||
Given the training data in the table below.
|
||
No.
|
||
CGPA
|
||
Interest in
|
||
Computing Subjects
|
||
Practice-oriented
|
||
Learner?
|
||
COMP 2211
|
||
Grade
|
||
Select COMP
|
||
as Major
|
||
≤3
|
||
High
|
||
No
|
||
B
|
||
No
|
||
≤3
|
||
High
|
||
No
|
||
A
|
||
No
|
||
> 3 AND ≤4
|
||
High
|
||
No
|
||
B
|
||
Yes
|
||
> 4
|
||
Medium
|
||
No
|
||
B
|
||
Yes
|
||
> 4
|
||
Low
|
||
Yes
|
||
B
|
||
Yes
|
||
> 4
|
||
Low
|
||
Yes
|
||
A
|
||
No
|
||
> 3 AND ≤4
|
||
Low
|
||
Yes
|
||
A
|
||
Yes
|
||
≤3
|
||
Medium
|
||
No
|
||
B
|
||
No
|
||
≤3
|
||
Low
|
||
Yes
|
||
B
|
||
Yes
|
||
> 4
|
||
Medium
|
||
Yes
|
||
B
|
||
Yes
|
||
≤3
|
||
Medium
|
||
Yes
|
||
A
|
||
Yes
|
||
> 3 AND ≤4
|
||
Medium
|
||
No
|
||
A
|
||
Yes
|
||
> 3 AND ≤4
|
||
High
|
||
Yes
|
||
B
|
||
Yes
|
||
> 4
|
||
Medium
|
||
No
|
||
A
|
||
No
|
||
(a)
|
||
[6.5 points] Given a new example with the following attribute values. Predict the value
|
||
of its “Select COMP as Major” using Na¨ıve Bayes classifier. Show all the steps.
|
||
CGPA ≤3
|
||
Interest in Computing Subjects = Medium
|
||
Practice-oriented Learner = Yes
|
||
COMP 2211 Grade = B
|
||
(b)
|
||
[7.5 points] Similar to the above, but this time, predict the value of its “Select COMP
|
||
as Major” using K-nearest neighbor for K = 5. Complete the following table and state
|
||
the prediction result based on the data in the completed table. For similarity measure,
|
||
use a simple match of attribute values:
|
||
S(ai, bi) =
|
||
X
|
||
i=1
|
||
wi ∗distance(ai, bi)
|
||
where distance(ai, bi) is 0 if ai equals bi, and 1 otherwise. ai and bi are either CGPA,
|
||
interest in computing subjects, practice-oriented learner or COMP 2211 grade. Weights,
|
||
wi, are all 1 except for interest in computing subjects, it is 2.
|
||
No.
|
||
Class
|
||
Distance to New Example
|
||
No
|
||
No
|
||
Yes
|
||
Yes
|
||
Yes
|
||
No
|
||
Yes
|
||
No
|
||
Yes
|
||
Yes
|
||
Yes
|
||
Yes
|
||
Yes
|
||
No', 14, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [14 points] Na¨ıve Bayes and K-Nearest Neighbors
|
||
Given the training data in the table below.
|
||
No.
|
||
CGPA
|
||
Interest in
|
||
Computing Subjects
|
||
Practice-oriented
|
||
Learner?
|
||
COMP 2211
|
||
Grade
|
||
Select COMP
|
||
as Major
|
||
≤3
|
||
High
|
||
No
|
||
B
|
||
No
|
||
≤3
|
||
High
|
||
No
|
||
A
|
||
No
|
||
> 3 AND ≤4
|
||
High
|
||
No
|
||
B
|
||
Yes
|
||
> 4
|
||
Medium
|
||
No
|
||
B
|
||
Yes
|
||
> 4
|
||
Low
|
||
Yes
|
||
B
|
||
Yes
|
||
> 4
|
||
Low
|
||
Yes
|
||
A
|
||
No
|
||
> 3 AND ≤4
|
||
Low
|
||
Yes
|
||
A
|
||
Yes
|
||
≤3
|
||
Medium
|
||
No
|
||
B
|
||
No
|
||
≤3
|
||
Low
|
||
Yes
|
||
B
|
||
Yes
|
||
> 4
|
||
Medium
|
||
Yes
|
||
B
|
||
Yes
|
||
≤3
|
||
Medium
|
||
Yes
|
||
A
|
||
Yes
|
||
> 3 AND ≤4
|
||
Medium
|
||
No
|
||
A
|
||
Yes
|
||
> 3 AND ≤4
|
||
High
|
||
Yes
|
||
B
|
||
Yes
|
||
> 4
|
||
Medium
|
||
No
|
||
A
|
||
No
|
||
(a)
|
||
[6.5 points] Given a new example with the following attribute values. Predict the value
|
||
of its “Select COMP as Major” using Na¨ıve Bayes classifier. Show all the steps.
|
||
CGPA ≤3
|
||
Interest in Computing Subjects = Medium
|
||
Practice-oriented Learner = Yes
|
||
COMP 2211 Grade = B
|
||
Answer: Let
|
||
E be CGPA ≤3, Interest in Computing Subjects = Medium, Practice-oriented
|
||
Learner = Yes, COMP 2211 Grade = B
|
||
E1 be CGPA ≤3
|
||
E2 be interest in computing subjects = medium
|
||
E3 be practice-oriented learner = yes
|
||
E4 be COMP 2211 grade = B
|
||
P(Y es|E) = P(E1|Y es)P(E2|Y es)P(E3|Y es)P(E4|Y es)P(Y es)
|
||
P(E)
|
||
P(Y es) = 9/14 = 0.643
|
||
P(E1|Y es) = 2/9 = 0.222
|
||
P(E2|Y es) = 4/9 = 0.444
|
||
P(E3|Y es) = 6/9 = 0.667
|
||
P(E4|Y es) = 6/9 = 0.667
|
||
P(Y es|E) = (0.222)(0.444)(0.667)(0.668)(0.443)
|
||
P(E)
|
||
= 0.028
|
||
P(E)
|
||
P(No|E) = P(E1|No)P(E2|No)P(E3|No)P(E4|No)P(No)
|
||
P(E)
|
||
P(No) = 5/14 = 0.356
|
||
P(E1|No) = 3/5 = 0.6
|
||
P(E2|No) = 2/5 = 0.4
|
||
P(E3|No) = 1/5 = 0.2
|
||
P(E4|No) = 2/5 = 0.4
|
||
P(No|E) = (0.6)(0.4)(0.2)(0.4)(0.357)
|
||
P(E)
|
||
= 0.007
|
||
P(E)
|
||
Hence, the Na¨ıve Bayes classifier predicts “Select COMP as Major” = Yes for the new
|
||
example.
|
||
Marking scheme:
|
||
0.5 for giving each conditional and prior probability. 6 points in total.
|
||
0.5 for giving the correct prediction.
|
||
(b)
|
||
[7.5 points] Similar to the above, but this time, predict the value of its “Select COMP
|
||
as Major” using K-nearest neighbor for K = 5. Complete the following table and state
|
||
the prediction result based on the data in the completed table. For similarity measure,
|
||
use a simple match of attribute values:
|
||
S(ai, bi) =
|
||
X
|
||
i=1
|
||
wi ∗distance(ai, bi)
|
||
where distance(ai, bi) is 0 if ai equals bi, and 1 otherwise. ai and bi are either CGPA,
|
||
interest in computing subjects, practice-oriented learner or COMP 2211 grade. Weights,
|
||
wi, are all 1 except for interest in computing subjects, it is 2.
|
||
No.
|
||
Class
|
||
Distance to New Example
|
||
No
|
||
0 + 2 + 1 + 0 = 3
|
||
No
|
||
0 + 2 + 1 + 1 = 4
|
||
Yes
|
||
1 + 2 + 1 + 0 = 4
|
||
Yes
|
||
1 + 0 + 1 + 0 = 2
|
||
Yes
|
||
1 + 2 + 0 + 0 = 3
|
||
No
|
||
1 + 2 + 0 + 1 = 4
|
||
Yes
|
||
1 + 2 + 0 + 1 = 4
|
||
No
|
||
0 + 0 + 1 + 0 = 1
|
||
Yes
|
||
0 + 2 + 0 + 0 = 2
|
||
Yes
|
||
1 + 0 + 0 + 0 = 1
|
||
Yes
|
||
0 + 0 + 0 + 1 = 1
|
||
Yes
|
||
1 + 0 + 1 + 1 = 3
|
||
Yes
|
||
1 + 2 + 0 + 0 = 3
|
||
No
|
||
1 + 0 + 1 + 1 = 3
|
||
Among the 5 nearest neighbors, 4 are from class Yes, and 1 from class No. Hence, the
|
||
KNN classifier predicts “Select COMP as Major” = Yes for the new example.
|
||
Marking scheme:
|
||
0.5 point for giving each correct answer. 7 points in total.
|
||
0.5 point for giving the correct prediction.', ARRAY['Probabilistic Models', 'KNN and Clustering']::TEXT[], 'Probabilistic Models', NULL, ARRAY['Probabilistic Models', 'KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-a', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [14 points] Multilayer Perceptron (MLP)
|
||
This problem is about multilayer perceptron (MLP). Answer all the questions below.
|
||
(a) [3 points] State when using the F1 metric is better than using accuracy as an evaluation
|
||
metric. Also use a confusion matrix to illustrate the stated situation.
|
||
(b)
|
||
[2 points] Suppose we are dealing with binary classification tasks using MLP. Explain
|
||
why it is inappropriate to use ReLU as the activation function in the output layer.
|
||
(c)
|
||
[2 points] Design the output layer of an MLP for handling a multilabel classification
|
||
problem with n classes by stating the number of neurons and the activation function.
|
||
Remark: Multilabel classification is a supervised learning problem where an instance
|
||
may be associated with multiple labels.
|
||
(d)
|
||
[3 points] Explain why it is not good to initialize all weights of an MLP to zero.
|
||
Hint: Refer to the following updating rules of weights and biases for MLP.
|
||
δk = (Ok −Tk)Ok(1 −Ok)
|
||
δj = Oj(1 −Oj) P
|
||
k∈K δkwjk
|
||
wjk ←wjk −ηδkOj
|
||
wij ←wij −ηδjOi
|
||
θj ←θj −ηδj
|
||
θk ←θk −ηδk
|
||
(e)
|
||
[2 points] Explain what will happen if the learning rate η of an MLP is
|
||
(i) Too large
|
||
(ii) Too small
|
||
(f)
|
||
[2 points] Describe a way to avoid overfitting in MLP. Explain why it works.', 14, 6, NULL::jsonb, NULL, NULL, 'Problem 3 [14 points] Multilayer Perceptron (MLP)
|
||
This problem is about multilayer perceptron (MLP). Answer all the questions below.
|
||
(a) [3 points] State when using the F1 metric is better than using accuracy as an evaluation
|
||
metric. Also use a confusion matrix to illustrate the stated situation.
|
||
Answer:
|
||
When the dataset is unbalanced.
|
||
When false-negative and false-positive matter a lot.
|
||
Actual/Predicted
|
||
Infectious disease=yes
|
||
Infectious disease=no
|
||
Infectious disease=yes
|
||
Infectious disease=no
|
||
Accuracy = 0.8016
|
||
F1 score = 0.0741
|
||
Marking scheme:
|
||
1 point for stating the situation
|
||
2 points for the confusion matrix
|
||
(b)
|
||
[2 points] Suppose we are dealing with binary classification tasks using MLP. Explain
|
||
why it is inappropriate to use ReLU as the activation function in the output layer.
|
||
Answer:
|
||
We cannot determine the cut-off threshold to distinguish between the output classes when
|
||
there is an unbounded output range.
|
||
Marking scheme:
|
||
2 points for explaining by the “unbounded” range of ReLU so cannot determine the
|
||
cut-off
|
||
1 point if mentioning the range of function (*not describing what ReLU do, but
|
||
stating the range) but no further explanation
|
||
0 point if only stating ReLU can output value more than 0 & 1 without mentioning it
|
||
has an unbounded range (bounded function beyond the range from 0 to 1 can work,
|
||
just do the mapping)
|
||
(c)
|
||
[2 points] Design the output layer of an MLP for handling a multilabel classification
|
||
problem with n classes by stating the number of neurons and the activation function.
|
||
Remark: Multilabel classification is a supervised learning problem where an instance
|
||
may be associated with multiple labels.
|
||
Answer:
|
||
n neurons and sigmoid function.
|
||
Marking scheme:
|
||
1 point for n neurons
|
||
1 point for sigmoid
|
||
(d)
|
||
[3 points] Explain why it is not good to initialize all weights of an MLP to zero.
|
||
Hint: Refer to the following updating rules of weights and biases for MLP.
|
||
δk = (Ok −Tk)Ok(1 −Ok)
|
||
δj = Oj(1 −Oj) P
|
||
k∈K δkwjk
|
||
wjk ←wjk −ηδkOj
|
||
wij ←wij −ηδjOi
|
||
θj ←θj −ηδj
|
||
θk ←θk −ηδk
|
||
Answer:
|
||
If a network is initialized with all zeros, all the neurons will propagate on the same
|
||
gradient, making different neurons learn the same features. Thus, this leads to poor
|
||
performance.
|
||
Marking scheme:
|
||
3 points if “the same update/propagate/feature learned/symmetry problem”
|
||
2 points if “zero updates” as it isn’t always true for all MLP design
|
||
2 points if stating only gradient computation are the same for neurons
|
||
1 point if only mentioning deltaj will become zero / stating gradient “may” not
|
||
update
|
||
0 point for low efficiency / poor performance
|
||
(e)
|
||
[2 points] Explain what will happen if the learning rate η of an MLP is
|
||
(i) Too large
|
||
(ii) Too small
|
||
Answer:
|
||
(i)
|
||
Cause the model to coverage too quickly to a sub-optimal solution.
|
||
Unstable training like oscillations.
|
||
Marking scheme:
|
||
1 point for explaining what will happen if the learning rate is too large.
|
||
not accept learn faster/overfit/low accuracy
|
||
(ii) Learning will be slow.
|
||
Marking scheme:
|
||
1 point for explaining what will happen if the learning rate is too small.
|
||
not accept underfit
|
||
(f)
|
||
[2 points] Describe a way to avoid overfitting in MLP. Explain why it works.
|
||
Answer:
|
||
Adding regulations helps to keep the weights small, such that it is less likely for the
|
||
model to have a large variance (i.e. be sensitive to noise and fluctuations in data).
|
||
Marking scheme:
|
||
1 point for method
|
||
1 point for explanation
|
||
only valid explanation (but not stating the definition or recalling some rule of thumbs)
|
||
get the explanation point', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-a', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [15 points] Digital Image Processing
|
||
(a) Assume we apply the following kernel to an 8-bit grayscale image.
|
||
K =
|
||
|
||
|
||
−1
|
||
−2
|
||
−2
|
||
|
||
|
||
(i)
|
||
[2 points] Determine the maximum and minimum possible values that a pixel to
|
||
which the given kernel is applied can have. Do not perform any normalization.
|
||
(ii)
|
||
[2 points] Suggest a grey-level transformation function to ensure that any output of
|
||
this kernel will be within the legal range of a standard 8-bit grayscale image.
|
||
(b) Consider the following 2 × 2 image:
|
||
Apply the following image operations sequentially.
|
||
(i)
|
||
[3 points] Show the resulting image of size 4 × 4 after adding reflection padding.
|
||
(ii) [4 points] Apply a 3×3 averaging kernel to the resulting image of part (b)(i). Assume
|
||
the output image after image averaging is in the same shape as the input image by
|
||
doing zero padding. Round the number to integer if needed.
|
||
(iii) [4 points] Compute the optimal threshold after applying ONE ITERATION of Otsu’s
|
||
method on the resulting image of part (b)(ii). Assume the initial threshold is set to
|
||
the mean pixel intensity of the resulting image.', 15, 7, NULL::jsonb, NULL, NULL, 'Problem 4 [15 points] Digital Image Processing
|
||
(a) Assume we apply the following kernel to an 8-bit grayscale image.
|
||
K =
|
||
|
||
|
||
−1
|
||
−2
|
||
−2
|
||
|
||
|
||
(i)
|
||
[2 points] Determine the maximum and minimum possible values that a pixel to
|
||
which the given kernel is applied can have. Do not perform any normalization.
|
||
Answer:
|
||
Since an 8-bit grayscale image is assumed, the highest value that each pixel can
|
||
have is 255, and the lowest value is 0.
|
||
After applying the kernel, the maximum
|
||
value is achieved when the negative values of the kernel multiply 0s and the pos-
|
||
itive values of the kernel multiply 255s. Then, the maximum achievable value is
|
||
vmax = (2 + 2 + 1)(255) = 1275. Following the same reasoning, the minimum value
|
||
will be vmin = (−2 −2 −1)(255) = −1275.
|
||
Marking scheme:
|
||
1 point for stating the maximum possible value.
|
||
1 point for stating the minimum possible value.
|
||
(ii)
|
||
[2 points] Suggest a grey-level transformation function to ensure that any output of
|
||
this kernel will be within the legal range of a standard 8-bit grayscale image.
|
||
Answer:
|
||
A 8-bit image must have a range from 0 to 255. Since the maximum and minimum
|
||
possible values for the gray levels are vmax = 1275 and vmin = −1275, respectively,
|
||
the function will be
|
||
Ioutput = 255
|
||
Iinput −vmin
|
||
vmax −vmin
|
||
|
||
= 255
|
||
Iinput −(−1275)
|
||
1275 −(−1275)
|
||
|
||
= 255
|
||
Iinput + 1275
|
||
|
||
where Iinput and Ioutput are the input and output images, respectively.
|
||
Marking scheme:
|
||
2 points for stating a transformation function.
|
||
-1 point if the function is merely returning legal range.
|
||
(b) Consider the following 2 × 2 image:
|
||
Apply the following image operations sequentially.
|
||
(i)
|
||
[3 points] Show the resulting image of size 4 × 4 after adding reflection padding.
|
||
Answer:
|
||
Marking scheme:
|
||
0.25 point for each correct value. 3 points in total.
|
||
(ii)
|
||
[4 points] Apply a 3 × 3 averaging kernel to the resulting image of part (b)(i). As-
|
||
sume the output image after image averaging is in the same shape as the input image
|
||
by doing zero padding. Round the number to integer if needed.
|
||
Answer:
|
||
Marking scheme:
|
||
0.25 point for each correct value. 4 points in total.
|
||
(iii) [4 points] Compute the optimal threshold after applying ONE ITERATION of Otsu’s
|
||
method on the resulting image of part (b)(ii). Assume the initial threshold is set to
|
||
the mean pixel intensity of the resulting image.
|
||
Answer:
|
||
Initial threshold = (0 + 2 + 4 + 4 + 4 + 9 + 12 + 10 + 8 + 15 + 18 + 14 + 8 + 14 +
|
||
16 + 12)/16 = 9.375
|
||
µ1 = (0 + 2 + 4 + 4 + 4 + 9 + 8 + 8)/8 = 4.875
|
||
µ2 = (12 + 10 + 15 + 18 + 14 + 14 + 16 + 12)/8 = 13.875
|
||
Optimal threshold = (4.875 + 13.875) = 9.375
|
||
Marking scheme:
|
||
1 point for the answer of initial threshold.
|
||
1 point for the answer of µ1.
|
||
1 point for the answer of µ2.
|
||
1 point for the optimal threshold after 1 iteration.
|
||
-1 point for incorrect input from part b(ii). I.e. incorrect calculation based on
|
||
result of part b(ii.)', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-b', '1', NULL, 1, 'long_question', 'long_answer', 'Problem 1 [19 points] Convolutional Neural Network (CNN)
|
||
(a)
|
||
[1.5 points] Suppose there is a convolutional layer in a CNN with the following parame-
|
||
ters.
|
||
Kernel size: 3 × 3
|
||
Zero padding: 1-pixel border on each side
|
||
Stride: 2-pixel in each direction
|
||
Compute the output feature map of the convolutional layer with the following input im-
|
||
age and kernel. Assume the number of channels for both input and output images is 1.
|
||
Express your answer as a 2D nested list. E.g. [[2,2],[2,2]].
|
||
Input image:
|
||
Kernel:
|
||
(b)
|
||
[1.5 points] Give an example of a data augmentation technique that would be useful for
|
||
classifying images of cats and dogs, but not for classifying handwritten digits. Briefly
|
||
explain your answer.
|
||
(c) [4.5 points] Assume we have a CNN with the architecture as described in Table 1. Com-
|
||
plete the table by filling in the input and output shape of each layer in the format: ‘height
|
||
x width x channel’ (Note: A space is placed before and after the symbol ’x’). To illustrate
|
||
that, the input shape of the 1st convolutional layer has been inserted for you.
|
||
Formula:
|
||
Convolution output size
|
||
= [ (size of image dimension - size of kernel dimension + 2 × padding) / stride ] + 1
|
||
Layer (Size, Specifications)
|
||
Input Shape
|
||
Output Shape
|
||
7x7 conv, 64 kernels, stride 2, padding 3, with biases
|
||
224 x 224 x 3
|
||
3x3 max pooling, stride 2, padding 1
|
||
3x3 conv, 128 kernels, stride 2, with biases
|
||
3x3 conv, 256 kernels, stride 2, padding 1, with biases
|
||
3x3 conv, 512 kernels, stride 2, padding 1, with biases
|
||
Table 1: Architecture of a CNN
|
||
(d)
|
||
[2 points] Assume we want to add a fully-connected layer at the back of the CNN as
|
||
described in part (b). However, the output of the resulting network still contains a large
|
||
number of parameters. State what we should do if we want to reduce the number of
|
||
parameters and summarize the output.
|
||
Hint: We want to turn N × N × 512 output to 1 × 1 × 512.
|
||
Remark: Convolution layer is not an acceptable answer.
|
||
(e)
|
||
[2.5 points] Calculate the total number of parameters of the model? (i.e. for all the
|
||
layers described in Table 1).
|
||
(f) Assume the model should classify images into 1000 distinct classes by
|
||
(i) [2.5 points] Appending the network shown in Table 2 to the end of the CNN network
|
||
(i.e. the one described in Table 1) appended with the structure that you added in
|
||
part (c).
|
||
Layer (Size, Specifications)
|
||
Input Shape
|
||
Output Shape
|
||
Flatten
|
||
1 x 1 x 512
|
||
Fully-connected (Flatten before feed-in)
|
||
Table 2: Classification network
|
||
Calculate the total number of parameters in the whole network (i.e.
|
||
the layers
|
||
described in Table 1, your answer in part (c), and the layers described in Table 2).
|
||
(ii) [2.5 points] Now, suppose we use the MLP described in Table 3 to classify the images
|
||
instead of CNN.
|
||
Layer (Size, Specifications)
|
||
Input Shape
|
||
Output Shape
|
||
Flatten
|
||
224 x 224 x 3
|
||
Fully-connected (Flatten before feed-in)
|
||
Table 3: Classification using MLP
|
||
Calculate the total number of parameters in the MLP (i.e. only those described in
|
||
Table 3).
|
||
(g)
|
||
[2 points] State what you observe in part (e). Also, state the property of CNN, which
|
||
leads to this difference.', 19, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [19 points] Convolutional Neural Network (CNN)
|
||
(a)
|
||
[1.5 points] Suppose there is a convolutional layer in a CNN with the following parame-
|
||
ters.
|
||
Kernel size: 3 × 3
|
||
Zero padding: 1-pixel border on each side
|
||
Stride: 2-pixel in each direction
|
||
Compute the output feature map of the convolutional layer with the following input im-
|
||
age and kernel. Assume the number of channels for both input and output images is 1.
|
||
Express your answer as a 2D nested list. E.g. [[2,2],[2,2]]. If your calculated answer
|
||
is a floating-point number, convert it to an integer using the floor function.
|
||
Input image:
|
||
Kernel:
|
||
Answer:
|
||
[[73]] for using the unflipped kernel OR
|
||
[[27]] for using the flipped kernel.
|
||
Marking scheme:
|
||
Accepted answers:
|
||
[[73 (39, 43, 69)]], [[27 (61, 31, 57)]] (for flipped kernel)
|
||
-1 point if the format is wrong. e.g., missing bracket(s).
|
||
If there are more than 1 value, the correct answer will be instead a 2x2 array ex-
|
||
tracted from the following arrays. There are two possible solutions:
|
||
[A[1, 1], A[1, 2]], [A[2, 1], A[2, 2]]
|
||
[A[1, 1], A[1, 3]], [A[3, 1], A[3, 3]] (stride 2 with extended zero padding)
|
||
A = [[ 2, 16, 38, 28],
|
||
[ 5, 27, 61, 53],
|
||
[ 8, 31, 57, 60],
|
||
[ 3, 13, 21, 27]]
|
||
A = [[18, 44, 22, 12],
|
||
[25, 73, 39, 17],
|
||
[22, 69, 43, 10],
|
||
[ 7, 27, 19,
|
||
3]]
|
||
(this two array is generated by
|
||
scipy.signal.convolve2d(in1, in2, mode=''full'', boundary=''fill'', fillvalue=0))
|
||
There are two possible arrays because traditionally, the former one is the “real” con-
|
||
volution. I.e. the kernel or image is flipped along both axes before the convolving op-
|
||
erations. However, at the latter one (the originally intended answer), it’s calculated
|
||
WITHOUT the flipping. This operation is called the cross-correlation operation.
|
||
(b)
|
||
[1.5 points] Give an example of a data augmentation technique that would be useful for
|
||
classifying images of cats and dogs, but not for classifying handwritten digits. Briefly
|
||
explain your answer.
|
||
Answer:
|
||
Flipping the image horizontally. Doing this to a dog/cat image would be reasonable,
|
||
but not so for an image of a handwritten digit.
|
||
Marking scheme:
|
||
2 points for “rotate and reflections” as augmentation methods.
|
||
0 point for image processing methods.
|
||
(c) [4.5 points] Assume we have a CNN with the architecture as described in Table 1. Com-
|
||
plete the table by filling in the input and output shape of each layer in the format: ‘height
|
||
x width x channel’ (Note: A space is placed before and after the symbol ’x’). To illustrate
|
||
that, the input shape of the 1st convolutional layer has been inserted for you.
|
||
Formula:
|
||
Convolution output size
|
||
= [ (size of image dimension - size of kernel dimension + 2 × padding) / stride ] + 1
|
||
If your calculated answer is a floating-point number, convert it to an integer using the
|
||
floor function.
|
||
Layer (Size, Specifications)
|
||
Input Shape
|
||
Output Shape
|
||
7x7 conv, 64 kernels, stride 2, padding 3, with biases
|
||
224 x 224 x 3
|
||
112 x 112 x 64
|
||
3x3 max pooling, stride 2, padding 1
|
||
112 x 112 x 64
|
||
56 x 56 x 64
|
||
3x3 conv, 128 kernels, stride 2, with biases
|
||
56 x 56 x 64
|
||
27 x 27 x 128
|
||
3x3 conv, 256 kernels, stride 2, padding 1, with biases
|
||
27 x 27 x 128
|
||
13 x 13 x 256
|
||
3x3 conv, 512 kernels, stride 2, padding 1, with biases
|
||
13 x 13 x 256
|
||
7 x 7 x 512
|
||
Table 1: Architecture of a CNN
|
||
Marking scheme:
|
||
0.25 point for number of channels
|
||
0.25 point for height and width
|
||
-1 point if the format is wrong, e.g. (c, h, w) instead of (h, w, c)
|
||
(d)
|
||
[2 points] Assume we want to add a fully-connected layer at the back of the CNN as
|
||
described in part (b). However, the output of the resulting network still contains a large
|
||
number of parameters. State what we should do if we want to reduce the number of
|
||
parameters and summarize the output.
|
||
Hint: We want to turn N × N × 512 output to 1 × 1 × 512.
|
||
Remark: Convolution layer is not an acceptable answer.
|
||
Answer:
|
||
Max pooling or average pooling.
|
||
Marking scheme:
|
||
2 points for giving the correct answer.
|
||
-0.5 point if type of pooling wasn’t specified.
|
||
-0.5 point if pooling is not mentioned. (e.g. taking average) As pooling is a standard
|
||
component of neural networks.
|
||
1 point for resize.
|
||
(e)
|
||
[2.5 points] Calculate the total number of parameters of the model? (i.e. for all the
|
||
layers described in Table 1).
|
||
Answer:
|
||
The total number of parameters =(7 × 7 × 3 × 64 + 64)+
|
||
(3 × 3 × 64 × 128 + 128)+
|
||
(3 × 3 × 128 × 256 + 256)+
|
||
(3 × 3 × 256 × 512 + 512)
|
||
=9472 + 73856 + 295168 + 1180160
|
||
=1558656
|
||
Marking scheme:
|
||
2.5 point for giving the correct answer.
|
||
(f) Assume the model should classify images into 1000 distinct classes by
|
||
(i) [2.5 points] Appending the network shown in Table 2 to the end of the CNN network
|
||
(i.e. the one described in Table 1) appended with the structure that you added in
|
||
part (c).
|
||
Layer (Size, Specifications)
|
||
Input Shape
|
||
Output Shape
|
||
Flatten
|
||
1 x 1 x 512
|
||
Fully-connected (Flatten before feed-in)
|
||
Table 2: Classification network
|
||
Calculate the total number of parameters in the whole network (i.e. the layers de-
|
||
scribed in Table 1, your answer in part (d), and the layers described in Table 2).
|
||
Answer:
|
||
The total number of parameters =1558656 + (512 × 1000 + 1000)
|
||
=1558656 + 513000
|
||
=2071656
|
||
Marking scheme:
|
||
2.5 point for giving the correct answer.
|
||
(ii) [2.5 points] Now, suppose we use the MLP described in Table 3 to classify the images
|
||
instead of CNN.
|
||
Layer (Size, Specifications)
|
||
Input Shape
|
||
Output Shape
|
||
Flatten
|
||
224 x 224 x 3
|
||
Fully-connected (Flatten before feed-in)
|
||
Table 3: Classification using MLP
|
||
Calculate the total number of parameters in the MLP (i.e. only those described in
|
||
Table 3).
|
||
Answer:
|
||
The total number of parameters =(150528 × 1000 + 1000)
|
||
=150529000
|
||
Marking scheme:
|
||
2.5 point for giving the correct answer.
|
||
(g)
|
||
[2 points] State what you observe in part (f). Also, state the property of CNN, which
|
||
leads to this difference.
|
||
Answer:
|
||
The number of parameters of CNN is significantly less than the number of parameters of
|
||
pure MLP. This is because CNN uses shared parameters (kernels) to process the input
|
||
instead of using 1 parameter for each individual input.
|
||
Marking scheme:
|
||
1 point for observations. If the observation is wrong, explanation is ignored.
|
||
1 point for explanation if mentioned “large scale extraction” but “feature extraction”
|
||
only gains 0.5 point.
|
||
1 point for “sparse connection”
|
||
0 point for reducing output size.
|
||
As fully-connected layer could also reduce the
|
||
output size for the following fc layer, and yet the number of parameter is even more
|
||
bloated.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-b', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [13 points] Python Programming: Convolutional Neural Network
|
||
Suppose the following CNN model learns human emotions from RGB images of faces and
|
||
classifies into 12 emotion categories.
|
||
Assumptions: All layers have no padding. Both two convolutional layers have 3 × 3 kernels.
|
||
The model is trained with Adam optimizer in default learning rate, and the loss function is
|
||
categorical cross-entropy. You don’t need to specify extra metrics like accuracy.
|
||
(a)
|
||
[9 points] According to all the information given above, write the Python codes to con-
|
||
struct and compile the model using Keras library. The following import statements are
|
||
provided for you. Also, a reference of useful Keras classes and functions are given in the
|
||
appendix.
|
||
from keras.models import Sequential
|
||
from keras.layers import Conv2D, MaxPooling2D
|
||
from keras.layers import Dense, Flatten
|
||
(b)
|
||
[4 points] Now instead of classification, a student wants to predict a single continuous
|
||
real value from an input picture: how positive (or negative) the emotion is, ranging from
|
||
0 to 1. Is CNN in general able to do that? If yes, derive a model to complete such a task
|
||
from part (a) model. Point out which parts you will change when building and compiling
|
||
the model (if any) and explain how you will change it. (You don’t have to write codes,
|
||
but state clearly your ideas). If no, also explain why.
|
||
Appendix:
|
||
Below are some Keras documentation for your reference. Some irrelevant parameters are
|
||
omitted for conciseness.
|
||
Sequential class
|
||
tf.keras.Sequential(layers=None, name=None)
|
||
Sequential groups a linear stack of layers into a tf.keras.Model
|
||
add Method
|
||
Sequential.add(layer)
|
||
– layer: layer instance
|
||
compile Method
|
||
Model.compile(optimizer="rmsprop", loss=None)
|
||
– optimizer: String (name of optimizer) or optimizer instance.
|
||
– loss: Loss function.
|
||
Conv2D class
|
||
tf.keras.layers.Conv2D(
|
||
filters, kernel_size, strides=(1, 1), padding="valid", activation=None,
|
||
)
|
||
2D convolution layer (e.g. spatial convolution over images). When using this layer as the
|
||
first layer in a model, provide the keyword argument input shape (tuple of integers, does
|
||
not include the sample axis), e.g. input shape=(128,128,3) for 128x128 RGB pictures in
|
||
data format="channels last".
|
||
filters: Integer, the dimensionality of the output space (i.e. the number of output filters
|
||
in the convolution).
|
||
kernel size: An integer or tuple/list of 2 integers, specifying the height and width of the
|
||
2D convolution window. Can be a single integer to specify the same value for all spatial
|
||
dimensions.
|
||
strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution
|
||
along the height and width. Can be a single integer to specify the same value for all
|
||
spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any
|
||
dilation rate value != 1.
|
||
padding: one of “valid” or “same” (case-insensitive).
|
||
“valid” means no padding.
|
||
“same” results in padding with zeros evenly to the left/right or up/down of the input.
|
||
When padding="same" and strides=1, the output has the same size as the input.
|
||
activation: Activation function to use. If you don’t specify anything, no activation is
|
||
applied.
|
||
MaxPooling2D class
|
||
tf.keras.layers.MaxPooling2D(
|
||
pool_size=(2, 2), strides=None, padding="valid",
|
||
)
|
||
Max pooling operation for 2D spatial data.
|
||
pool size: integer or tuple of 2 integers, window size over which to take the maximum.
|
||
(2, 2) will take the max value over a 2x2 pooling window. If only one integer is specified,
|
||
the same window length will be used for both dimensions.
|
||
strides: Integer, tuple of 2 integers, or None.
|
||
Strides values.
|
||
Specifies how far the
|
||
pooling window moves for each pooling step. If None, it will default to pool size.
|
||
padding: One of “valid” or “same” (case-insensitive).
|
||
“valid” means no padding.
|
||
“same” results in padding evenly to the left/right or up/down of the input such that
|
||
output has the same height/width dimension as the input.
|
||
Flatten class
|
||
tf.keras.layers.Flatten()
|
||
Flattens the input.
|
||
Dense class
|
||
tf.keras.layers.Dense(
|
||
units, activation=None,
|
||
)
|
||
Regular densely-connected NN layer.
|
||
units: Positive integer, dimensionality of the output space.
|
||
activation: Activation function to use. If you don’t specify anything, no activation is
|
||
applied.
|
||
Common activation functions (in shorthand strings): “relu”, “sigmoid”, “softmax”
|
||
Common loss functions (in shorthand strings):
|
||
“categorical crossentropy”,
|
||
“sparse categorical crossentropy”,
|
||
“mean squared error” (same as “MSE”),
|
||
“mean absolute error” (same as “MAE”)
|
||
Common optimizer (in shorthand strings): “adam”', 13, 5, NULL::jsonb, NULL, NULL, 'Problem 2 [13 points] Python Programming: Convolutional Neural Network
|
||
Suppose the following CNN model learns human emotions from RGB images of faces and
|
||
classifies into 12 emotion categories.
|
||
Assumptions: All layers have no padding. Both two convolutional layers have 3 × 3 kernels.
|
||
The model is trained with Adam optimizer in default learning rate, and the loss function is
|
||
categorical cross-entropy. You don’t need to specify extra metrics like accuracy.
|
||
(a)
|
||
[9 points] According to all the information given above, write the Python codes to con-
|
||
struct and compile the model using Keras library. The following import statements are
|
||
provided for you. Also, a reference of useful Keras classes and functions are given in the
|
||
appendix.
|
||
from keras.models import Sequential
|
||
from keras.layers import Conv2D, MaxPooling2D
|
||
from keras.layers import Dense, Flatten
|
||
Answer:
|
||
model = Sequential()
|
||
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation=''relu'',
|
||
strides=(3,3), input_shape=(32, 32, 1)))
|
||
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation=''relu'', stride=(2,2)))
|
||
model.add(MaxPooling2D(pool_size=(2, 2)))
|
||
model.add(Flatten())
|
||
model.add(Dense(units=12, activation=''softmax''))
|
||
model.compile(optimizer=''adam'', loss=''categorical_crossentropy'')
|
||
Marking scheme:
|
||
Apply the following rules until the score is 0.
|
||
– -1 for each of Pooling, Flatten, Dense, Compile lines if it is missing or has any
|
||
wrong parameter
|
||
– -1 for each of the wrong or missing parameters in a Conv2D layer.
|
||
This in-
|
||
cludes any parameter that is good by default but you break it by setting another
|
||
absolutely wrong value.
|
||
– -1 if the input shape is not specified in any manner
|
||
– -1 for syntax errors like messed up brackets, the layers are not actually added to
|
||
a model, etc. Small typos on keywords, etc. are not penalized
|
||
(b)
|
||
[4 points] Now instead of classification, a student wants to predict a single continuous
|
||
real value from an input picture: how positive (or negative) the emotion is, ranging from
|
||
0 to 1. Is CNN in general able to do that? If yes, derive a model to complete such a task
|
||
from part (a) model. Point out which parts you will change when building and compiling
|
||
the model (if any) and explain how you will change it. (You don’t have to write codes,
|
||
but state clearly your ideas). If no, also explain why.
|
||
Answer:
|
||
Yes, CNN is able to do that in general, and the following parts need to be changed.
|
||
Change the Dense layer to units=1.
|
||
Change the Dense layer activation to any other than softmax, e.g., relu or sigmoid.
|
||
Change the loss to any regression loss, e.g., MSE, MAE.
|
||
Marking scheme:
|
||
1 point for stating CNN is able to do that in general.
|
||
1 point for changing the output Dense unit to 1
|
||
1 point for changing output activation to anything not related to probability and
|
||
range containing [0,1] (sigmoid, linear, relu, etc.)
|
||
1 point for changing the loss function to any real value comparison (subtraction),
|
||
e.g. Mean Squared Error, Mean Absolute Error, or other similar self-defined losses.
|
||
-0.5 point for each change if the student doesn’t specify to what it changes or changes
|
||
it to a wrong value.
|
||
Appendix:
|
||
Below are some Keras documentation for your reference. Some irrelevant parameters are
|
||
omitted for conciseness.
|
||
Sequential class
|
||
tf.keras.Sequential(layers=None, name=None)
|
||
Sequential groups a linear stack of layers into a tf.keras.Model
|
||
add Method
|
||
Sequential.add(layer)
|
||
– layer: layer instance
|
||
compile Method
|
||
Model.compile(optimizer="rmsprop", loss=None)
|
||
– optimizer: String (name of optimizer) or optimizer instance.
|
||
– loss: Loss function.
|
||
Conv2D class
|
||
tf.keras.layers.Conv2D(
|
||
filters, kernel_size, strides=(1, 1), padding="valid", activation=None,
|
||
)
|
||
2D convolution layer (e.g. spatial convolution over images). When using this layer as the
|
||
first layer in a model, provide the keyword argument input shape (tuple of integers, does
|
||
not include the sample axis), e.g. input shape=(128,128,3) for 128x128 RGB pictures in
|
||
data format="channels last".
|
||
filters: Integer, the dimensionality of the output space (i.e. the number of output filters
|
||
in the convolution).
|
||
kernel size: An integer or tuple/list of 2 integers, specifying the height and width of the
|
||
2D convolution window. Can be a single integer to specify the same value for all spatial
|
||
dimensions.
|
||
strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution
|
||
along the height and width. Can be a single integer to specify the same value for all
|
||
spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any
|
||
dilation rate value != 1.
|
||
padding: one of “valid” or “same” (case-insensitive).
|
||
“valid” means no padding.
|
||
“same” results in padding with zeros evenly to the left/right or up/down of the input.
|
||
When padding="same" and strides=1, the output has the same size as the input.
|
||
activation: Activation function to use. If you don’t specify anything, no activation is
|
||
applied.
|
||
MaxPooling2D class
|
||
tf.keras.layers.MaxPooling2D(
|
||
pool_size=(2, 2), strides=None, padding="valid",
|
||
)
|
||
Max pooling operation for 2D spatial data.
|
||
pool size: integer or tuple of 2 integers, window size over which to take the maximum.
|
||
(2, 2) will take the max value over a 2x2 pooling window. If only one integer is specified,
|
||
the same window length will be used for both dimensions.
|
||
strides: Integer, tuple of 2 integers, or None.
|
||
Strides values.
|
||
Specifies how far the
|
||
pooling window moves for each pooling step. If None, it will default to pool size.
|
||
padding: One of “valid” or “same” (case-insensitive).
|
||
“valid” means no padding.
|
||
“same” results in padding evenly to the left/right or up/down of the input such that
|
||
output has the same height/width dimension as the input.
|
||
Flatten class
|
||
tf.keras.layers.Flatten()
|
||
Flattens the input.
|
||
Dense class
|
||
tf.keras.layers.Dense(
|
||
units, activation=None,
|
||
)
|
||
Regular densely-connected NN layer.
|
||
units: Positive integer, dimensionality of the output space.
|
||
activation: Activation function to use. If you don’t specify anything, no activation is
|
||
applied.
|
||
Common activation functions (in shorthand strings): “relu”, “sigmoid”, “softmax”
|
||
Common loss functions (in shorthand strings):
|
||
“categorical crossentropy”,
|
||
“sparse categorical crossentropy”,
|
||
“mean squared error” (same as “MSE”),
|
||
“mean absolute error” (same as “MAE”)
|
||
Common optimizer (in shorthand strings): “adam”', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-b', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [8 points] Minimax and Alpha-Beta Pruning
|
||
Two players, MAX and MIN, are playing a game that can be represented by a tree, as shown
|
||
below.
|
||
(a)
|
||
[3.5 points] Complete the following table by estimating the minimax value of each non-
|
||
terminal node.
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
F
|
||
G
|
||
(b)
|
||
[0.5 point] State the proper move of the maximizer by writing down one of the root’s
|
||
outgoing edges (i.e. E-a or E-b).
|
||
Note: The root node of the given tree is A.
|
||
(c)
|
||
[1.5 points] State whether minimax-based AI will choose to make a move which will
|
||
result in a slower victory. Explain your answer.
|
||
(d)
|
||
[2.5 points] Suppose we now apply alpha-beta pruning on the game tree. Indicate the
|
||
edge(s) that would be pruned (eliminated from consideration) by writing down the edge
|
||
labels (i.e. E-a, E-b, E-c, . . ., E-n). You may assume that the branches are explored
|
||
from left to right.', 8, 9, NULL::jsonb, NULL, NULL, 'Problem 3 [8 points] Minimax and Alpha-Beta Pruning
|
||
Two players, MAX and MIN, are playing a game that can be represented by a tree, as shown
|
||
below.
|
||
(a)
|
||
[3.5 points] Complete the following table by estimating the minimax value of each non-
|
||
terminal node.
|
||
Answer:
|
||
A
|
||
B
|
||
C
|
||
-1
|
||
D
|
||
E
|
||
F
|
||
-1
|
||
G
|
||
Marking scheme:
|
||
0.5 point for each correct answer. 3.5 points in total.
|
||
(b)
|
||
[0.5 point] State the proper move of the maximizer by writing down one of the root’s
|
||
outgoing edges (i.e. E-a or E-b).
|
||
Note: The root node of the given tree is A.
|
||
Answer:
|
||
E-a
|
||
Marking scheme:
|
||
0.5 point for the correct answer.
|
||
(c)
|
||
[1.5 points] State whether minimax-based AI will choose to make a move which will
|
||
result in a slower victory. Explain your answer.
|
||
Answer:
|
||
Yes, minimax-based AI may choose to make a move, resulting in a slower victory. Since
|
||
we may have two moves with the same maximum minimax value, it picks the one in
|
||
slower victory.
|
||
Marking scheme:
|
||
1 point for stating minimax-based AI will choose to make a move which will result
|
||
in a slower victory.
|
||
0.5 point for giving the proper explanation.
|
||
(d)
|
||
[2.5 points] Suppose we now apply alpha-beta pruning on the game tree. Indicate the
|
||
edge(s) that would be pruned (eliminated from consideration) by writing down the edge
|
||
labels (i.e. E-a, E-b, E-c, . . ., E-n). You may assume that the branches are explored
|
||
from left to right.
|
||
Answer:
|
||
E-j and E-f would be pruned.
|
||
Marking scheme:
|
||
If E-j appears: +1 point.
|
||
If E-f appears: +1.5 point.
|
||
If E-m or E-n appears: +0 point.
|
||
If other label appears: then d) become 0 points, no matter E-j and E-f appears or
|
||
not.', ARRAY['Search and Games']::TEXT[], 'Search and Games', 'Search and Games', ARRAY['Search and Games']::TEXT[], ARRAY['tree_search', 'pruning', 'manual_tracing']::TEXT[], 'easy', '', '', ''),
|
||
('COMP2211-2022-spring-final-part-b', '4', NULL, 4, 'long_question', 'short_answer', 'Problem 4 [7 points] Ethics of Artificial Intelligence
|
||
This question consists of five sub-questions, four of them are multiple-choice questions, and
|
||
one is a short question. Choose the BEST ANSWER among the given choices for each
|
||
multiple-choice question and put your answer in the given table, while for the short question,
|
||
answer it in a few sentences.
|
||
(a)
|
||
[1 point] Ethics in artificial intelligence is
|
||
(A) Something that is not an issue.
|
||
(B) Something that somebody else will do in the future.
|
||
(C) Something that we need to apply today.
|
||
(D) Something that is entirely solved in current AI systems.
|
||
(b)
|
||
[1 point] One approach that helps developers avoid unintentionally creating bias in AI
|
||
systems is
|
||
(A) Using a wide variety of appropriately diverse data for training.
|
||
(B) Using highly specific training data from a narrow range.
|
||
(C) Not using any training data.
|
||
(D) None of the above
|
||
(c)
|
||
[1 point] What are some of the ethical concerns around artificial intelligence?
|
||
I. Racial, gender or other types of bias.
|
||
II. Loss of jobs due to AI replacing workers performing repetitive tasks.
|
||
III. Concern about the trustworthiness of decision-making supported by AI systems.
|
||
IV. Privacy, for example, as human faces are photographed and recognized in public
|
||
spaces.
|
||
(A) I and II only
|
||
(B) I, II, and IV only
|
||
(C) All of the above
|
||
(D) None of the above
|
||
(d) [1 point] What is a significantly way in which developers of AI systems can guard against
|
||
introducing bias?
|
||
(A) Using only examples from their own environment as training data.
|
||
(B) Providing effective training data and performing regular tests and audits.
|
||
(C) Using less varied AI systems and datasets.
|
||
(D) Using government approved algorithms.
|
||
Question
|
||
Answer
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
[3 points] State THREE ethical issues involved with the introduction of autonomous
|
||
vehicles.', 7, 10, NULL::jsonb, NULL, NULL, 'Problem 4 [7 points] Ethics of Artificial Intelligence
|
||
This question consists of five sub-questions, four of them are multiple-choice questions, and
|
||
one is a short question. Choose the BEST ANSWER among the given choices for each
|
||
multiple-choice question and put your answer in the given table, while for the short question,
|
||
answer it in a few sentences.
|
||
(a)
|
||
[1 point] Ethics in artificial intelligence is
|
||
(A) Something that is not an issue.
|
||
(B) Something that somebody else will do in the future.
|
||
(C) Something that we need to apply today.
|
||
(D) Something that is entirely solved in current AI systems.
|
||
(b)
|
||
[1 point] One approach that helps developers avoid unintentionally creating bias in AI
|
||
systems is
|
||
(A) Using a wide variety of appropriately diverse data for training.
|
||
(B) Using highly specific training data from a narrow range.
|
||
(C) Not using any training data.
|
||
(D) None of the above
|
||
(c)
|
||
[1 point] What are some of the ethical concerns around artificial intelligence?
|
||
I. Racial, gender or other types of bias.
|
||
II. Loss of jobs due to AI replacing workers performing repetitive tasks.
|
||
III. Concern about the trustworthiness of decision-making supported by AI systems.
|
||
IV. Privacy, for example, as human faces are photographed and recognized in public
|
||
spaces.
|
||
(A) I and II only
|
||
(B) I, II, and IV only
|
||
(C) All of the above
|
||
(D) None of the above
|
||
(d) [1 point] What is a significantly way in which developers of AI systems can guard against
|
||
introducing bias?
|
||
(A) Using only examples from their own environment as training data.
|
||
(B) Providing effective training data and performing regular tests and audits.
|
||
(C) Using less varied AI systems and datasets.
|
||
(D) Using government approved algorithms.
|
||
Question
|
||
Answer
|
||
(a)
|
||
C
|
||
(b)
|
||
A
|
||
(c)
|
||
C
|
||
(d)
|
||
B
|
||
Marking scheme:
|
||
1 point for each correct answer. 4 points in total.
|
||
(e)
|
||
[3 points] State THREE ethical issues involved with the introduction of autonomous
|
||
vehicles.
|
||
Answer:
|
||
Who is to blame in an accident?
|
||
In an emergency situation who should the car prioritize?
|
||
Increase in use of cars is bad for the environment.
|
||
Cost of the cars.
|
||
Marking scheme:
|
||
1 point for giving each correct ethical issue. 3 points in total.', ARRAY['Ethics of AI']::TEXT[], 'Ethics of AI', 'Ethics of AI', ARRAY['Ethics of AI']::TEXT[], ARRAY['concept_explanation', 'argumentation', 'comparison']::TEXT[], 'easy', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions
|
||
Indicate whether the following statements are true or false by circling T or F. You get 1
|
||
point for each correct answer.
|
||
(a)
|
||
T
|
||
F
|
||
Na¨ıve Bayes classifier is a probabilistic algorithm that computes the probability of an
|
||
instance belonging to each class and selects the class with the highest probability as the
|
||
output.
|
||
(b)
|
||
T
|
||
F
|
||
Na¨ıve Bayes classifier can be used for multi-class classification task.
|
||
(c)
|
||
T
|
||
F
|
||
K-Nearest Neighbors is a supervised learning and parametric algorithm that can be used
|
||
to solve both classification and regression problems.
|
||
(d)
|
||
T
|
||
F
|
||
In K-Nearest Neighbors algorithm, the value of K should always be odd to avoid ties.
|
||
(e)
|
||
T
|
||
F
|
||
In D-fold cross validation, an increase of D will result in a longer time required to cross-
|
||
validate the result.
|
||
(f)
|
||
T
|
||
F
|
||
After centroids initialization, K-Means Clustering is sensitive to the order in which the
|
||
data points are processed, meaning that changing the order of the input data points may
|
||
lead to different clustering results.
|
||
(g)
|
||
T
|
||
F
|
||
K-Median Clustering is robust to the presence of outliers and noise in the dataset, as it
|
||
uses the median of the data points as the center of each cluster.
|
||
Note: The median is the middle number in a sorted, ascending or descending list of
|
||
numbers.
|
||
(h)
|
||
T
|
||
F
|
||
A perceptron with different initialization of weights and bias may result in different
|
||
decision boundaries.
|
||
(i)
|
||
T
|
||
F
|
||
For perceptron, larger learning rates always lead to faster convergence.
|
||
(j)
|
||
T
|
||
F
|
||
Multilayer Perceptron with more layers are more expressive than Single Layer Perceptron
|
||
regardless of the activation function is linear or not.', 10, 3, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions
|
||
Indicate whether the following statements are true or false by circling T or F. You get 1
|
||
point for each correct answer.
|
||
(a)
|
||
T
|
||
F
|
||
Na¨ıve Bayes classifier is a probabilistic algorithm that computes the probability of an
|
||
instance belonging to each class and selects the class with the highest probability as the
|
||
output.
|
||
(b)
|
||
T
|
||
F
|
||
Na¨ıve Bayes classifier can be used for multi-class classification task.
|
||
(c)
|
||
T
|
||
F
|
||
K-Nearest Neighbors is a supervised learning and parametric algorithm that can be used
|
||
to solve both classification and regression problems.
|
||
(d)
|
||
T
|
||
F
|
||
In K-Nearest Neighbors algorithm, the value of K should always be odd to avoid ties.
|
||
(e)
|
||
T
|
||
F
|
||
In D-fold cross validation, an increase of D will result in a longer time required to cross-
|
||
validate the result.
|
||
(f)
|
||
T
|
||
F
|
||
After centroids initialization, K-Means Clustering is sensitive to the order in which the
|
||
data points are processed, meaning that changing the order of the input data points may
|
||
lead to different clustering results.
|
||
(g)
|
||
T
|
||
F
|
||
K-Median Clustering is robust to the presence of outliers and noise in the dataset, as it
|
||
uses the median of the data points as the center of each cluster.
|
||
Note: The median is the middle number in a sorted, ascending or descending list of
|
||
numbers.
|
||
(h)
|
||
T
|
||
F
|
||
A perceptron with different initialization of weights and bias may result in different
|
||
decision boundaries.
|
||
(i)
|
||
T
|
||
F
|
||
For perceptron, larger learning rates always lead to faster convergence.
|
||
(j)
|
||
T
|
||
F
|
||
Multilayer Perceptron with more layers are more expressive than Single Layer Perceptron
|
||
regardless of the activation function is linear or not.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [21 points] Python Fundamentals
|
||
(a)
|
||
[9 points] Consider the following Numpy arrays:
|
||
import numpy as np
|
||
# np.arange(stop)
|
||
# return an array of evenly spaced values within the half-open interval [0,stop),
|
||
# the default step size is 1.
|
||
A = np.arange(10)
|
||
B = np.array([[5, 10, 15],
|
||
[20, 25, 30],
|
||
[35, 40, 45]])
|
||
Write the output for each of the following Python statements. If the output is an empty
|
||
array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) print(A[1:5])
|
||
(ii) print(A[1:5:2])
|
||
(iii) print(A[5::-2])
|
||
(iv) print(B[:2,1:])
|
||
(v) print(B[A[:1]])
|
||
(vi) print(B[A[1:]])
|
||
(vii) A[A%3 == 0] = 100
|
||
print(A)
|
||
(viii) # np.mean(a, axis)
|
||
# return the average of the array elements over the specified axis.
|
||
print(np.mean(B, axis=-1))
|
||
(ix) # np.ndarray.transpose(*axis)
|
||
# return a view of the array with axes transposed.
|
||
print(B.transpose((1,0)))
|
||
(b)
|
||
[6 points] Write the output for the following Python code segments. If the output is an
|
||
empty array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) import numpy as np
|
||
A = np.array([1,2])
|
||
B = np.array([[1,2],
|
||
[2,4],
|
||
[3,6],
|
||
[4,8]])
|
||
print(B/A)
|
||
(ii) import numpy as np
|
||
A = np.array([[0, 0, 0],
|
||
[1, 2, 3],
|
||
[4, 5, 6],
|
||
[7, 8, 9]])
|
||
B = np.array([10, 11, 12, 13])
|
||
print(A + B)
|
||
(iii) import numpy as np
|
||
A = np.array([0, 10, 20, 30])
|
||
B = np.array([1, 2, 3])
|
||
# np.newaxis
|
||
# increase the dimension of an array by adding new axis
|
||
print(A[:, np.newaxis] + B)
|
||
(c)
|
||
[6 points] The distance function between two points x and y in the Poincare ball model
|
||
can be calculated by the following formula:
|
||
arccosh
|
||
|
||
1 + 2
|
||
||x −y||2
|
||
(1 −||x||2)(1 −||y||2)
|
||
|
||
= arccosh
|
||
|
||
1 + 2
|
||
Pn
|
||
i=1(xi −yi)2
|
||
(1 −Pn
|
||
i=1 x2
|
||
i )(1 −Pn
|
||
i=1 y2
|
||
i )
|
||
|
||
For example, if x = (0.0, 0.0), y = (0.0, 0.1), their Poincare distance is
|
||
arccosh
|
||
|
||
1 + 2
|
||
(0.0 −0.0)2 + (0.0 −0.1)2
|
||
(1 −(0.02 + 0.02))(1 −(0.02 + 0.12))
|
||
|
||
≈0.2006707
|
||
Given the following NumPy arrays, X and Y, where each 1-D array represents a data
|
||
point:
|
||
X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.2]])
|
||
Y = np.array([[0, 0.1], [0.2, 0.3]])
|
||
Compute the Poincare distance between each data point in X and each data point in Y
|
||
with a one-line Python expression, such that the result of the expression is
|
||
[[0.2006707 0.75504766]
|
||
[0.2027011 0.47971794]
|
||
[0.46441642 0.22308802]]
|
||
Note:
|
||
An expression is a combination of values, variables, operators, and calls to functions.
|
||
Your expression should work with any numbers of data points in X and Y and any
|
||
number of values in the data points.
|
||
You can assume that the number of attribute values in each data point are the same
|
||
for both X and Y.
|
||
There must be no explicit loops in your expression.
|
||
You may find the following attribute or functions useful for this question.
|
||
numpy.expand_dims(a, axis)
|
||
Insert a new axis to a that will appear at the axis position in the expanded array
|
||
shape.
|
||
numpy.square(x)
|
||
Return the element-wise square of the input x.
|
||
numpy.sum(a, axis)
|
||
Return the sum of array a’s elements over a given axis.
|
||
a.T
|
||
The transposed array of a.
|
||
numpy.matmul(a, b)
|
||
Return the matrix product of a and b.
|
||
numpy.arccosh(x)
|
||
Return the element-wise arccosh of the input x.
|
||
Write your one-line Python expression below.', 21, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [21 points] Python Fundamentals
|
||
(a)
|
||
[9 points] Consider the following Numpy arrays:
|
||
import numpy as np
|
||
# np.arange(stop)
|
||
# return an array of evenly spaced values within the half-open interval [0,stop),
|
||
# the default step size is 1.
|
||
A = np.arange(10)
|
||
B = np.array([[5, 10, 15],
|
||
[20, 25, 30],
|
||
[35, 40, 45]])
|
||
Write the output for each of the following Python statements. If the output is an empty
|
||
array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) print(A[1:5])
|
||
Answer:
|
||
[1 2 3 4]
|
||
(ii) print(A[1:5:2])
|
||
Answer:
|
||
[1 3]
|
||
(iii) print(A[5::-2])
|
||
Answer:
|
||
[5 3 1]
|
||
(iv) print(B[:2,1:])
|
||
Answer:
|
||
[[10 15]
|
||
[25 30]]
|
||
(v) print(B[A[:1]])
|
||
Answer:
|
||
[[5 10 15]]
|
||
(vi) print(B[A[1:]])
|
||
Answer:
|
||
Error
|
||
(vii) A[A%3 == 0] = 100
|
||
print(A)
|
||
Answer:
|
||
[100 1 2 100 4 5 100 7 8 100]
|
||
(viii) # np.mean(a, axis)
|
||
# return the average of the array elements over the specified axis.
|
||
print(np.mean(B, axis=-1))
|
||
Answer:
|
||
[10 25 40]
|
||
(ix) # np.ndarray.transpose(*axis)
|
||
# return a view of the array with axes transposed.
|
||
print(B.transpose((1,0)))
|
||
Answer:
|
||
[[5 20 35]
|
||
[10 25 40]
|
||
[15 30 45]]
|
||
Marking Scheme:
|
||
1 point for each sub-question. No partial point. The brackets must be correct in
|
||
order to get the points. 9 points in total.
|
||
(b)
|
||
[6 points] Write the output for the following Python code segments. If the output is an
|
||
empty array, write “Empty Array”. If an error occurs, write “Error”.
|
||
(i) import numpy as np
|
||
A = np.array([1,2])
|
||
B = np.array([[1,2],
|
||
[2,4],
|
||
[3,6],
|
||
[4,8]])
|
||
print(B/A)
|
||
Answer:
|
||
[[1 1]
|
||
[2 2]
|
||
[3 3]
|
||
[4 4]]
|
||
(ii) import numpy as np
|
||
A = np.array([[0, 0, 0],
|
||
[1, 2, 3],
|
||
[4, 5, 6],
|
||
[7, 8, 9]])
|
||
B = np.array([10, 11, 12, 13])
|
||
print(A + B)
|
||
Answer: Error
|
||
(iii) import numpy as np
|
||
A = np.array([0, 10, 20, 30])
|
||
B = np.array([1, 2, 3])
|
||
# np.newaxis
|
||
# increase the dimension of an array by adding new axis
|
||
print(A[:, np.newaxis] + B)
|
||
Answer:
|
||
[[1 2 3]
|
||
[11 12 13]
|
||
[21 22 23]
|
||
[31 32 33]]
|
||
Marking Scheme:
|
||
2 points for each sub-question. No partial point. The bracket must be correct in
|
||
order to get the points. 6 points in total.
|
||
(c)
|
||
[6 points] The distance function between two points x and y in the Poincare ball model
|
||
can be calculated by the following formula:
|
||
arccosh
|
||
|
||
1 + 2
|
||
||x −y||2
|
||
(1 −||x||2)(1 −||y||2)
|
||
|
||
= arccosh
|
||
|
||
1 + 2
|
||
Pn
|
||
i=1(xi −yi)2
|
||
(1 −Pn
|
||
i=1 x2
|
||
i )(1 −Pn
|
||
i=1 y2
|
||
i )
|
||
|
||
For example, if x = (0.0, 0.0), y = (0.0, 0.1), their Poincare distance is
|
||
arccosh
|
||
|
||
1 + 2
|
||
(0.0 −0.0)2 + (0.0 −0.1)2
|
||
(1 −(0.02 + 0.02))(1 −(0.02 + 0.12))
|
||
|
||
≈0.2006707
|
||
Given the following NumPy arrays, X and Y, where each 1-D array represents a data
|
||
point:
|
||
X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.2]])
|
||
Y = np.array([[0, 0.1], [0.2, 0.3]])
|
||
Compute the Poincare distance between each data point in X and each data point in Y
|
||
with a one-line Python expression, such that the result of the expression is
|
||
[[0.2006707 0.75504766]
|
||
[0.2027011 0.47971794]
|
||
[0.46441642 0.22308802]]
|
||
Note:
|
||
An expression is a combination of values, variables, operators, and calls to functions.
|
||
Your expression should work with any numbers of data points in X and Y and any
|
||
number of values in the data points.
|
||
You can assume that the number of attribute values in each data point are the same
|
||
for both X and Y.
|
||
There must be no explicit loops in your expression.
|
||
You may find the following attribute or functions useful for this question.
|
||
numpy.expand_dims(a, axis)
|
||
Insert a new axis to a that will appear at the axis position in the expanded array
|
||
shape.
|
||
numpy.square(x)
|
||
Return the element-wise square of the input x.
|
||
numpy.sum(a, axis)
|
||
Return the sum of array a’s elements over a given axis.
|
||
a.T
|
||
The transposed array of a.
|
||
numpy.matmul(a, b)
|
||
Return the matrix product of a and b.
|
||
numpy.arccosh(x)
|
||
Return the element-wise arccosh of the input x.
|
||
Write your one-line Python expression below.
|
||
Answer:
|
||
print(np.arccosh(1 + 2 * np.sum((np.expand_dims(X, axis=1) - Y) ** 2, axis=2) /
|
||
np.matmul(np.expand_dims(1 - np.sum(X ** 2, axis=1), axis=1),
|
||
np.expand_dims(1 - np.sum(Y ** 2, axis=1), axis=1).T)))
|
||
Marking Scheme:
|
||
1.5 points: correct usage of np.expand dims(), 0.5 points each. The axis must be
|
||
correct in order to get the points. Can be replaced by np.newaxis or other equivalent
|
||
functions.
|
||
1.5 points: correct usage of np.sum(), 0.5 points each. The axis must be correct in
|
||
order to get the points.
|
||
1.5 points: Ccorrect usage of np.square(), 0.5 points each. Can be replaced by **2.
|
||
0.5 points: correct usage of np.matmul(). The second argument should have the
|
||
transpose (np.transpose() or T.). But if the second no.expand dims() in the denom-
|
||
inator is expanded in axis = 0, the transpose should not occur. Can be replaced by
|
||
np.dot() or @.
|
||
0.5 points: correct usage of np.arccosh().
|
||
0.5 points: correct basic formula as arccosh(1 + 2*(nominator/denominator)). Gen-
|
||
erally the students can get this 0.5 points if they write their answers.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [8 points] Na¨ıve Bayes Classifier
|
||
Given the following training dataset
|
||
Humidity
|
||
Wind
|
||
Temperature
|
||
Play Tennis
|
||
Day 1
|
||
High
|
||
Weak
|
||
Mild
|
||
Yes
|
||
Day 2
|
||
?
|
||
Weak
|
||
Cool
|
||
Yes
|
||
Day 3
|
||
Normal
|
||
Strong
|
||
Cool
|
||
No
|
||
Day 4
|
||
Normal
|
||
Strong
|
||
?
|
||
Yes
|
||
Day 5
|
||
High
|
||
Weak
|
||
Hot
|
||
No
|
||
Suppose we want to classify whether we will play tennis on Day 6 with the following attributes:
|
||
Humidity=High, Wind=Weak, Temperature=?
|
||
using Na¨ıve Bayes Classifier with the given training dataset.
|
||
Unfortunately, some of the training data is missing, which is denoted as “?”. Can Na¨ıve
|
||
Bayes Classifier handle this dataset with missing data? i.e., Can you still yield a classifica-
|
||
tion output of whether you will play tennis on Day 6?
|
||
If not, explain why. Otherwise, show how you will approach this problem.
|
||
In your answer, you should clearly mention why your explanation is valid with reference to
|
||
the property of the Na¨ıve Bayes algorithm.', 8, 9, NULL::jsonb, NULL, NULL, 'Problem 3 [8 points] Na¨ıve Bayes Classifier
|
||
Given the following training dataset
|
||
Humidity
|
||
Wind
|
||
Temperature
|
||
Play Tennis
|
||
Day 1
|
||
High
|
||
Weak
|
||
Mild
|
||
Yes
|
||
Day 2
|
||
?
|
||
Weak
|
||
Cool
|
||
Yes
|
||
Day 3
|
||
Normal
|
||
Strong
|
||
Cool
|
||
No
|
||
Day 4
|
||
Normal
|
||
Strong
|
||
?
|
||
Yes
|
||
Day 5
|
||
High
|
||
Weak
|
||
Hot
|
||
No
|
||
Suppose we want to classify whether we will play tennis on Day 6 with the following attributes:
|
||
Humidity=High, Wind=Weak, Temperature=?
|
||
using Na¨ıve Bayes Classifier with the given training dataset.
|
||
Unfortunately, some of the training data is missing, which is denoted as “?”. Can Na¨ıve
|
||
Bayes Classifier handle this dataset with missing data? i.e., Can you still yield a classifica-
|
||
tion output of whether you will play tennis on Day 6?
|
||
If not, explain why. Otherwise, show how you will approach this problem.
|
||
In your answer, you should clearly mention why your explanation is valid with reference to
|
||
the property of the Na¨ıve Bayes algorithm.
|
||
Answer:
|
||
Na¨ıve Bayes can handle a dataset with missing value.
|
||
Attributes are handled separately
|
||
by the algorithm at both model construction time and prediction time. Therefore, the miss-
|
||
ing attributes can simply be ignored while preparing the model, and also ignored when a
|
||
probability is calculated for a class value.
|
||
Marking Scheme:
|
||
Case 1: Student’s answer is a clear “Yes”, or suggests that the Na¨ıve Bayes can be used
|
||
in this scenario.
|
||
– 2 points for indicating/suggesting that Na¨ıve Bayes can be used in this scenario.
|
||
– 3 points for mentioning the property of the Na¨ıve Bayes, where attributes are handled
|
||
separately and/or independently.
|
||
– 3 points for mentioning the data entry with the missing value in the table can simply
|
||
be ignored.
|
||
However, answers that suggest ignoring the data for the entire day
|
||
(row), or the data for the entire attribute (column) are not acceptable.
|
||
Answer
|
||
should also include that the attribute “Temperature” for Day 6 can be ignored for
|
||
when calculating the classification result.
|
||
Case 2: Student’s answer is a clear “No”.
|
||
– 0 point, and partial credit is NOT given for the subsequent explanation. Because,
|
||
whatever explanation is for the wrong claim.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'easy', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors
|
||
Suppose you are building a K-Nearest Neighbors classifier that predicts user preference on
|
||
movie genre based on their rating history (a 5-point scale).
|
||
User
|
||
Movie Ratings
|
||
Preferred Genre
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
Alice
|
||
Romance
|
||
Bob
|
||
Romance
|
||
Charlie
|
||
Action
|
||
Table 1: Training Dataset
|
||
User
|
||
Movie Ratings
|
||
Preferred Genre
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
David
|
||
?
|
||
Table 2: Test Dataset
|
||
(a)
|
||
[8 points] Based on the movie ratings in Table 1 and Table 2, calculate the Cosine and
|
||
Euclidean distance between ratings of each training data and test data (round to the
|
||
third decimal place). Fill in the distance columns in the following table, and determine
|
||
the class of David for each distance when K=1. You can find the following approxima-
|
||
tions and equations helpful for this question.
|
||
Approximated values:
|
||
√
|
||
5 ≈2.236
|
||
√
|
||
12 ≈3.464
|
||
√
|
||
14 ≈3.742
|
||
√
|
||
√
|
||
22 =
|
||
√
|
||
242 ≈15.556
|
||
√
|
||
√
|
||
66 =
|
||
√
|
||
1452 ≈38.105
|
||
Cosine distance formula:
|
||
cosθ =
|
||
Pn
|
||
i=1 Xtrain
|
||
i
|
||
× Xtest
|
||
i
|
||
qPn
|
||
i=1(Xtrain
|
||
i
|
||
)2
|
||
pPn
|
||
i=1(Xtest
|
||
i
|
||
)2
|
||
|
||
Cosine Distance = 1 −cosθ
|
||
where Xtrain
|
||
i
|
||
and Xtest
|
||
i
|
||
are the feature value of training data and test data, respectively,
|
||
and n is the number of features.
|
||
Euclidean distance formula:
|
||
Euclidean Distance =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n
|
||
X
|
||
i=1
|
||
(Xtrain
|
||
i
|
||
−Xtest
|
||
i
|
||
)2
|
||
where Xtrain
|
||
i
|
||
and Xtest
|
||
i
|
||
are the feature value of training data and test data, respectively,
|
||
and n is the number of features.
|
||
User
|
||
Movie Ratings
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
Preferred
|
||
Genre
|
||
Cosine
|
||
Distance
|
||
Euclidean
|
||
Distance
|
||
Alice
|
||
Romance
|
||
Bob
|
||
Romance
|
||
Charlie
|
||
Action
|
||
David
|
||
?
|
||
Class for cosine distance:
|
||
Class for Euclidean distance:
|
||
(b)
|
||
[6 points] Let’s say that Alice tends to rate movies highly, while David tends to rate
|
||
them relatively poorly. Given this scenario, which distance metric is more appropriate?
|
||
Also, describe why it makes the classifier better.', 14, 10, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors
|
||
Suppose you are building a K-Nearest Neighbors classifier that predicts user preference on
|
||
movie genre based on their rating history (a 5-point scale).
|
||
User
|
||
Movie Ratings
|
||
Preferred Genre
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
Alice
|
||
Romance
|
||
Bob
|
||
Romance
|
||
Charlie
|
||
Action
|
||
Table 1: Training Dataset
|
||
User
|
||
Movie Ratings
|
||
Preferred Genre
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
David
|
||
?
|
||
Table 2: Test Dataset
|
||
(a)
|
||
[8 points] Based on the movie ratings in Table 1 and Table 2, calculate the Cosine and
|
||
Euclidean distance between ratings of each training data and test data (round to the
|
||
third decimal place). Fill in the distance columns in the following table, and determine
|
||
the class of David for each distance when K=1. You can find the following approxima-
|
||
tions and equations helpful for this question.
|
||
Approximated values:
|
||
√
|
||
5 ≈2.236
|
||
√
|
||
12 ≈3.464
|
||
√
|
||
14 ≈3.742
|
||
√
|
||
√
|
||
22 =
|
||
√
|
||
242 ≈15.556
|
||
√
|
||
√
|
||
66 =
|
||
√
|
||
1452 ≈38.105
|
||
Cosine distance formula:
|
||
cosθ =
|
||
Pn
|
||
i=1 Xtrain
|
||
i
|
||
× Xtest
|
||
i
|
||
qPn
|
||
i=1(Xtrain
|
||
i
|
||
)2
|
||
pPn
|
||
i=1(Xtest
|
||
i
|
||
)2
|
||
|
||
Cosine Distance = 1 −cosθ
|
||
where Xtrain
|
||
i
|
||
and Xtest
|
||
i
|
||
are the feature value of training data and test data, respectively,
|
||
and n is the number of features.
|
||
Euclidean distance formula:
|
||
Euclidean Distance =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n
|
||
X
|
||
i=1
|
||
(Xtrain
|
||
i
|
||
−Xtest
|
||
i
|
||
)2
|
||
where Xtrain
|
||
i
|
||
and Xtest
|
||
i
|
||
are the feature value of training data and test data, respectively,
|
||
and n is the number of features.
|
||
User
|
||
Movie Ratings
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
Preferred
|
||
Genre
|
||
Cosine
|
||
Distance
|
||
Euclidean
|
||
Distance
|
||
Alice
|
||
Romance
|
||
Bob
|
||
Romance
|
||
Charlie
|
||
Action
|
||
David
|
||
?
|
||
Class for cosine distance:
|
||
Class for Euclidean distance:
|
||
User
|
||
Movie Ratings
|
||
Movie1
|
||
Movie2
|
||
Movie3
|
||
Preferred
|
||
Genre
|
||
Cosine
|
||
Distance
|
||
Euclidean
|
||
Distance
|
||
Alice
|
||
Romance
|
||
0.003
|
||
3.464
|
||
Bob
|
||
Romance
|
||
0.029
|
||
3.741 or 3.742
|
||
Charlie
|
||
Action
|
||
0.100
|
||
2.236
|
||
David
|
||
?
|
||
Class for cosine distance: Romance
|
||
Class for Euclidean distance: Action
|
||
Marking Scheme:
|
||
1 point for giving each correct value. 6 points in total.
|
||
– 0.5 point for each correct value that is not rounded to the 3rd decimal place.
|
||
– No point for others (e.g., square root values or equations).
|
||
1 point for giving each correct class. 2 points in total.
|
||
(b)
|
||
[6 points] Let’s say that Alice tends to rate movies highly, while David tends to rate
|
||
them relatively poorly. Given this scenario, which distance metric is more appropriate?
|
||
Also, describe why it makes the classifier better.
|
||
Answer:
|
||
In this scenario, the Cosine distance metric performs better, indicating that the ground
|
||
truth class is “Romance”. The reason for this is that the Cosine distance metric con-
|
||
siders the direction of the vectors being compared, while the Euclidean distance metric
|
||
takes both direction and magnitude into account. Since the Cosine distance metric is
|
||
less sensitive to variations in magnitude, it is better suited for comparing vectors with
|
||
different scales. Given that this data contains bias in terms of scale, the Cosine distance
|
||
metric is a better choice in this case.
|
||
Marking Scheme:
|
||
1 point for stating Cosine distance metric performs better, indicating that the ground
|
||
truth class is “Romance”.
|
||
2 points for stating Cosine distance metric considers the direction, while Euclidean
|
||
distance metric takes both direction and magnitude into account.
|
||
– Note that direction or angle should be mentioned in the state of Cosine dis-
|
||
tance metric, while magnitude should be mentioned in the state of Euclidean
|
||
distance metric.
|
||
3 points for stating Cosine distance metric is less sensitive to variations in mag-
|
||
nitude, and the data contain bias in terms of scale, Cosine distance is a better
|
||
choice.
|
||
– 1 point for stating that Cosine distance metric is less sensitive to variations.
|
||
– 1 point for stating that Cosine distance metric is less sensitive to the biased data
|
||
which is described in the problem.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [19 points] K-Means Clustering
|
||
Consider a 2-dimensional dataset with the following 5 data points:
|
||
A(1, 4), B(2, 3), C(4, 6), D(5, 7), E(8, 3)
|
||
Perform K-Means clustering on this dataset with K = 2. Use the following initial centroids
|
||
for your calculations:
|
||
Centroid 1: A(1, 4), Centroid 2: D(5, 7)
|
||
If a tie occurs, assign the points to Centroid 1. All calculations round to two decimal places.
|
||
(a) [7 points] Calculate the Euclidean distances between each data point and both centroids.
|
||
Fill in the distances in the table. Then, assign each point to the nearest centroid.
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Centroid 1
|
||
Centroid 2
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
(b)
|
||
[7 points] Recalculate the centroids using the mean of the points assigned to each cen-
|
||
troid. Fill in the values of new centroids after C1 and C2 in the table. Then repeat the
|
||
process of assigning points and recalculating centroids until convergence. You may not
|
||
need all the provided table templates. Leave them blank if the algorithm has already
|
||
converged. Report the final cluster assignments and centroids.
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(
|
||
,
|
||
)
|
||
C2(
|
||
,
|
||
)
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(
|
||
,
|
||
)
|
||
C2(
|
||
,
|
||
)
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(
|
||
,
|
||
)
|
||
C2(
|
||
,
|
||
)
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
(c) [5 points] The choice of the number of clusters K in K-Means clustering can significantly
|
||
impact the clustering results. Selecting too few or too many clusters can lead to overgen-
|
||
eralization or overfitting. Why this limitation is critical for K-Means clustering? Given
|
||
a dataset with an unknown number of clusters, please explain one way to determine a
|
||
suitable K.', 19, 12, NULL::jsonb, NULL, NULL, 'Problem 5 [19 points] K-Means Clustering
|
||
Consider a 2-dimensional dataset with the following 5 data points:
|
||
A(1, 4), B(2, 3), C(4, 6), D(5, 7), E(8, 3)
|
||
Perform K-Means clustering on this dataset with K = 2. Use the following initial centroids
|
||
for your calculations:
|
||
Centroid 1: A(1, 4), Centroid 2: D(5, 7)
|
||
If a tie occurs, assign the points to Centroid 1. All calculations round to two decimal places.
|
||
(a) [7 points] Calculate the Euclidean distances between each data point and both centroids.
|
||
Fill in the distances in the table. Then, assign each point to the nearest centroid.
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Centroid 1
|
||
Centroid 2
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
Answers:
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
Centroid 1
|
||
1.41
|
||
3.61
|
||
5.00
|
||
7.07
|
||
Centroid 2
|
||
5.00
|
||
5.00
|
||
1.41
|
||
5.00
|
||
Data points assigned to Centroid 1: A, B
|
||
Data points assigned to Centroid 2: C, D, E
|
||
Marking Scheme:
|
||
0.5 point for each correct value. 5 points in total.
|
||
1 point for giving the data points assigned to each centroid. 2 points in total.
|
||
-0.5 points for not rounding to two decimal places.
|
||
(b)
|
||
[7 points] Recalculate the centroids using the mean of the points assigned to each cen-
|
||
troid. Fill in the values of new centroids after C1 and C2 in the table. Then repeat the
|
||
process of assigning points and recalculating centroids until convergence. You may not
|
||
need all the provided table templates. Leave them blank if the algorithm has already
|
||
converged. Report the final cluster assignments and centroids.
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(
|
||
,
|
||
)
|
||
C2(
|
||
,
|
||
)
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(
|
||
,
|
||
)
|
||
C2(
|
||
,
|
||
)
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(
|
||
,
|
||
)
|
||
C2(
|
||
,
|
||
)
|
||
Data points assigned to Centroid 1:
|
||
Data points assigned to Centroid 2:
|
||
Answer:
|
||
Distance
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
C1(1.5, 3.5)
|
||
0.71
|
||
0.71
|
||
3.54
|
||
4.95
|
||
6.52
|
||
C2(5.67, 5.33)
|
||
4.86
|
||
4.35
|
||
1.80
|
||
1.80
|
||
3.30
|
||
Data points assigned to Centroid 1: A, B
|
||
Data points assigned to Centroid 2: C, D, E
|
||
Marking Scheme:
|
||
0.5 points for each correct value. 5 points in total.
|
||
1 point for giving the data points assigned to each centroid. 2 points in total.
|
||
-0.5 points for not rounding to two decimal places.
|
||
-1 point for filling in/copying more than 1 table template (incorrect convergence
|
||
statement).
|
||
(c) [5 points] The choice of the number of clusters K in K-Means clustering can significantly
|
||
impact the clustering results. Selecting too few or too many clusters can lead to overgen-
|
||
eralization or overfitting. Why this limitation is critical for K-Means clustering? Given
|
||
a dataset with an unknown number of clusters, please explain one way to determine a
|
||
suitable K.
|
||
Answer:
|
||
Clustering is an unsupervised learning method. In the scenario where you try to adopt K-
|
||
Means clustering or any other unsupervised models, you don’t know anything about the
|
||
shape, the number of clusters, or the distribution of the dataset. Such value k is exactly
|
||
what you are looking for and will never be given beforehand. One common method to
|
||
determine the optimal number of clusters is the elbow method, which identifies the point
|
||
where adding more clusters does not improve the clustering performance significantly.
|
||
However, this method can be subjective and may not always yield the optimal number
|
||
of clusters. Other methods, such as silhouette analysis or gap statistics, can also be used
|
||
to determine the optimal number of clusters.
|
||
Marking Scheme:
|
||
2 points for stating that K-Means is an unsupervised method and the best value K
|
||
will never be given beforehand.
|
||
3 points for giving and explaining a way to determine a suitable K.
|
||
-1 point for stating why the limitation is critical for K-Means clustering but the
|
||
answer is minor.
|
||
-1 point for paraphrasing the prompt.
|
||
-1 point for using only SSE to evaluate the model and use the K at minimum SSE.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [12 points] Perceptron
|
||
Given the following training dataset and test dataset:
|
||
Training
|
||
Test
|
||
x1
|
||
-2
|
||
-1
|
||
x2
|
||
-2
|
||
-1
|
||
T
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
(a)
|
||
[2 points] Suppose we use the training dataset to train a perceptron. At some point
|
||
during the training, the weights and bias of the perceptron have not changed after four
|
||
consecutive updates, but the current epoch is not yet finished.
|
||
Should we stop the
|
||
training? Explain why.
|
||
(b)
|
||
[2 points]
|
||
After the training has converged, what is the range of possible accuracy on the test
|
||
dataset?
|
||
A. 50%
|
||
B. 50% ∼75%
|
||
C. 50% ∼100%
|
||
D. 25% ∼75%
|
||
E. 0% ∼100%
|
||
(c)
|
||
[2 points] Consider the following implementation of a perceptron:
|
||
import numpy as np
|
||
def fit(X, T, W0, b0, eta, max_epochs):
|
||
W = np.ones(X.shape[0]) * W0
|
||
b = b0
|
||
for epoch in range(max_epochs):
|
||
for i in range(T.shape[0]):
|
||
O = predict(X, W, b)
|
||
E = T[i] - O
|
||
W = W + E * X
|
||
b = b + E
|
||
return W, b
|
||
def predict(X, W, b):
|
||
Z = np.dot(X, W) + b
|
||
O = (Z <= 0) * 2 - 1
|
||
return O
|
||
The predict function takes three inputs:
|
||
A 1-D or 2-D NumPy array, X, storing test example(s). The shape of X is (d,) or (n,
|
||
d) where n is the number of test examples and d is the number of features;
|
||
A 1-D NumPy array, W, storing the weights of the perceptron. The shape of W is (d,)
|
||
where d is the number features;
|
||
A bias value, b, for the perceptron.
|
||
Based on how the predict function is implemented, what is the activation function f(z)
|
||
of this perceptron?
|
||
(d)
|
||
[6 points]
|
||
The fit function takes six inputs:
|
||
A 2-D NumPy array, X, storing training examples. The shape of X is (m, d) where
|
||
m is the number of training examples and d is the number of features;
|
||
A 1-D NumPy array, T, storing training targets. The shape of T is (m,) where m is
|
||
the number training examples;
|
||
An initial value, W0, for the weights of the perceptron;
|
||
An initial value, b0, for the bias of the perceptron;
|
||
The learning rate, eta;
|
||
The maximum number of training epochs, max epochs.
|
||
Given these inputs, however, the fit function is not implemented correctly. Identify all
|
||
the errors by giving the line numbers that cause the errors, and propose ways to fix them.
|
||
You may find the following attribute or functions useful for this question.
|
||
numpy.ones(shape)
|
||
Return a new array of given shape, shape, filled with ones.
|
||
range(stop)
|
||
Return a sequence of numbers starting from 0, incrementing by 1, and ending at the
|
||
value stop.
|
||
ndarray.shape
|
||
Tuple of array dimensions.
|
||
numpy.dot(a, b)
|
||
Return the dot product of a and b if a and b are both 1-D arrays. If a is an N-D
|
||
array and b is a 1-D array, it is a sum product over the last axis of a and b is
|
||
returned.', 12, 15, NULL::jsonb, NULL, NULL, 'Problem 6 [12 points] Perceptron
|
||
Given the following training dataset and test dataset:
|
||
Training
|
||
Test
|
||
x1
|
||
-2
|
||
-1
|
||
x2
|
||
-2
|
||
-1
|
||
T
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
(a)
|
||
[2 points] Suppose we use the training dataset to train a perceptron. At some point
|
||
during the training, the weights and bias of the perceptron have not changed after four
|
||
consecutive updates, but the current epoch is not yet finished. Should we stop the train-
|
||
ing? Explain why.
|
||
Answer:
|
||
Yes, the training has converged so we should stop it from wasting time and resources.
|
||
Marking Scheme:
|
||
1 point for stating we should stop the training.
|
||
1 point for giving correct explanations.
|
||
(b)
|
||
[2 points]
|
||
After the training has converged, what is the range of possible accuracy on the test
|
||
dataset?
|
||
A. 50%
|
||
B. 50% ∼75%
|
||
C. 50% ∼100%
|
||
D. 25% ∼75%
|
||
E. 0% ∼100%
|
||
Answer:
|
||
D
|
||
Marking Scheme:
|
||
2 points for the correct answer.
|
||
(c)
|
||
[2 points] Consider the following implementation of a perceptron:
|
||
import numpy as np
|
||
def fit(X, T, W0, b0, eta, max_epochs):
|
||
W = np.ones(X.shape[0]) * W0
|
||
b = b0
|
||
for epoch in range(max_epochs):
|
||
for i in range(T.shape[0]):
|
||
O = predict(X, W, b)
|
||
E = T[i] - O
|
||
W = W + E * X
|
||
b = b + E
|
||
return W, b
|
||
def predict(X, W, b):
|
||
Z = np.dot(X, W) + b
|
||
O = (Z <= 0) * 2 - 1
|
||
return O
|
||
The predict function takes three inputs:
|
||
A 1-D or 2-D NumPy array, X, storing test example(s). The shape of X is (d,) or (n,
|
||
d) where n is the number of test examples and d is the number of features;
|
||
A 1-D NumPy array, W, storing the weights of the perceptron. The shape of W is (d,)
|
||
where d is the number features;
|
||
A bias value, b, for the perceptron.
|
||
Based on how the predict function is implemented, what is the activation function f(z)
|
||
of this perceptron?
|
||
Answer:
|
||
f(z) =
|
||
|
||
|
||
|
||
if z ≤0
|
||
−1
|
||
otherwise
|
||
Marking Scheme:
|
||
2 points for giving the correct activation function.
|
||
(d)
|
||
[6 points]
|
||
The fit function takes six inputs:
|
||
A 2-D NumPy array, X, storing training examples. The shape of X is (m, d) where
|
||
m is the number of training examples and d is the number of features;
|
||
A 1-D NumPy array, T, storing training targets. The shape of T is (m,) where m is
|
||
the number training examples;
|
||
An initial value, W0, for the weights of the perceptron;
|
||
An initial value, b0, for the bias of the perceptron;
|
||
The learning rate, eta;
|
||
The maximum number of training epochs, max epochs.
|
||
Given these inputs, however, the fit function is not implemented correctly. Identify all
|
||
the errors by giving the line numbers that cause the errors, and propose ways to fix them.
|
||
You may find the following attribute or functions useful for this question.
|
||
numpy.ones(shape)
|
||
Return a new array of given shape, shape, filled with ones.
|
||
range(stop)
|
||
Return a sequence of numbers starting from 0, incrementing by 1, and ending at the
|
||
value stop.
|
||
ndarray.shape
|
||
Tuple of array dimensions.
|
||
numpy.dot(a, b)
|
||
Return the dot product of a and b if a and b are both 1-D arrays. If a is an N-D
|
||
array and b is a 1-D array, it is a sum product over the last axis of a and b is
|
||
returned.
|
||
Answer:
|
||
On line 4, change X.shape[0] into X.shape[1]
|
||
On line 8 and 10, change X into X[i]
|
||
On line 9, multiply the right-hand side with eta
|
||
On line 9, change T[i]-O into O-T[i], or equivalently, on line 10 and 11, change +
|
||
into -.
|
||
The resulting fit function is as follows:
|
||
def fit(X, T, W0, b0, eta, max_epochs):
|
||
W = np.ones(X.shape[1]) * W0
|
||
b = b0
|
||
for epoch in range(max_epochs):
|
||
for i in range(T.shape[0]):
|
||
O = predict(X[i], W, b)
|
||
E = eta * (O - T[i])
|
||
W = W + E * X[i]
|
||
b = b + E
|
||
return W, b
|
||
Other equivalent solutions also exist.
|
||
Marking Scheme:
|
||
2 points for each error; 1 point for identifying the error (pointing out only the line
|
||
number does not count), and 1 point for correctly fixing the error. Partial points are
|
||
given to incomplete fixes of an error.
|
||
For fixes that correctly fixed an error but accidentally introduced other errors, points
|
||
are partially deducted. No extra points are deducted for purely irrelevant/erroneous
|
||
modifications of the code.
|
||
The final mark is capped at 6 points.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2023-spring-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [16 points] Multilayer Perceptron
|
||
(a)
|
||
[4 points] Consider the following dataset
|
||
x1
|
||
x2
|
||
F(x1,x2)
|
||
Given the following single layer perceptron, where activation function f is defined as
|
||
f(z) =
|
||
(
|
||
1, for z ≥0
|
||
0, otherwise
|
||
Can we classify all the data points correctly using this perceptron? If yes, give a set of
|
||
possible parameters. If no, briefly explain the reason.
|
||
(b)
|
||
[2 points] Consider the following dataset
|
||
x1
|
||
x2
|
||
F(x1,x2)
|
||
Using the same perceptron architecture in part (a), can we classify all the data points
|
||
correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly
|
||
explain the reason.
|
||
(c)
|
||
[10 points] Consider the same dataset in part (b), given the following multilayer percep-
|
||
tron, where activation function f is defined as identical with that in part (a),
|
||
can we classify all the data points correctly using this multilayer perceptron? If yes, give
|
||
a set of possible parameters. If no, briefly explain the reason.
|
||
Hint: You don’t need to use gradient descent to calculate the results.
|
||
You may try
|
||
to find a set of parameters to make the data points linear separable after the first layer
|
||
mapping.
|
||
-------------------- END OF PAPER --------------------', 16, 18, NULL::jsonb, NULL, NULL, 'Problem 7 [16 points] Multilayer Perceptron
|
||
(a)
|
||
[4 points] Consider the following dataset
|
||
x1
|
||
x2
|
||
F(x1,x2)
|
||
Given the following single layer perceptron, where activation function f is defined as
|
||
f(z) =
|
||
(
|
||
1, for z ≥0
|
||
0, otherwise
|
||
Can we classify all the data points correctly using this perceptron? If yes, give a set of
|
||
possible parameters. If no, briefly explain the reason.
|
||
Answer:
|
||
Yes, the four data points are linearly separable.
|
||
One possible set of parameters is
|
||
w1 = 1, w2 = 1, θ = −1.5.
|
||
Marking Scheme:
|
||
1 point for ‘yes’ answer.
|
||
3 points for correct parameter (no partial points).
|
||
(b)
|
||
[2 points] Consider the following dataset
|
||
Using the same perceptron architecture in part (a), can we classify all the data points
|
||
correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly
|
||
x1
|
||
x2
|
||
F(x1,x2)
|
||
explain the reason.
|
||
Answer:
|
||
No, the 4 data points are not linearly separable, so they can’t be fit with 100% accuracy
|
||
using single layer perceptron.
|
||
Marking Scheme:
|
||
1 point for ‘yes’ answer.
|
||
3 points for correct parameters (no partial points).
|
||
(c)
|
||
[10 points] Consider the same dataset in part (b), given the following multilayer percep-
|
||
tron, where activation function f is defined as identical with that in part (a),
|
||
can we classify all the data points correctly using this multilayer perceptron? If yes, give
|
||
a set of possible parameters. If no, briefly explain the reason.
|
||
Hint: You don’t need to use gradient descent to calculate the results.
|
||
You may try
|
||
to find a set of parameters to make the data points linear separable after the first layer
|
||
mapping.
|
||
Answer:
|
||
Yes, one possible set of parameters can be obtained as following procedure:
|
||
we first use two linear functions to seperate the 4 datatpoints, e.g, w1 = −1, w2 = 1, θ1 =
|
||
−0.5, w3 = −1, w4 = 1, θ2 = 0.5, the related diagram is as followed,
|
||
(1,1)
|
||
(0,1)
|
||
(x1)
|
||
(x2)
|
||
(1,0)
|
||
(0,0)
|
||
𝑤1𝑥1 + 𝑤2𝑥2 + 𝜃1
|
||
𝑤3𝑥1 + 𝑤4𝑥2 + 𝜃2
|
||
After first layer mapping, the result will be
|
||
x1
|
||
x2
|
||
y1 = w1x1 + w2x2 + θ1
|
||
y2 = w3x1 + w4x2 + θ2
|
||
f(y1)
|
||
f(y2)
|
||
F(x1, x2)
|
||
-0.5
|
||
0.5
|
||
0.5
|
||
1.5
|
||
-1
|
||
-0.5
|
||
-0.5
|
||
0.5
|
||
For last three columns of the table, we can see the datapoints are linear separable after
|
||
the first layer, where we can set w5 = −1, w6 = 1, θ3 = −0.5 to fulfill the requirement.
|
||
To sum up, one possible set of parameters are
|
||
w1 = −1, w2 = 1, θ1 = −0.5, w3 = −1, w4 = 1, θ2 = 0.5, w5 = −1, w6 = 1, θ3 = −0.5
|
||
Marking Scheme:
|
||
1 point for ‘yes’ answer.
|
||
6 points if first layer mapping makes all data points linear separable (3 points if
|
||
the data points are ‘almost’ linear separable, e.g., one set of parameters are correct
|
||
except that the signs are opposite).
|
||
3 points for correct second layer parameters.
|
||
-------------------- END OF PAPER --------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [5 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 0.5 point for each correct answer.
|
||
Question
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)
|
||
Answer
|
||
(a) In the following code
|
||
import numpy as np
|
||
a = np.array([0, 1, 1, 2, 5, 5, 3])
|
||
b = np.array([0, 1, 2, 3, 4, 5])
|
||
c = (b == a.reshape(7, 1))
|
||
The array c has the shape (7, 1).
|
||
(b) After executing the following block of code:
|
||
import numpy as np
|
||
a = np.array([[1, 2], [3, 4], [5, 6]])
|
||
b = np.array([[1, 2, 3], [0, 0, 0], [1, 0, 0]])
|
||
c = a.dot(b)
|
||
The array c is array([[22, 28], [0, 0], [1, 2]]).
|
||
(c) The Na¨ıve Bayes Classifier operates under the assumption that the presence of a partic-
|
||
ular feature in a class is independent of the presence of any other feature.
|
||
(d) In Na¨ıve Bayes, we assume that P(B|e1, e2) = P(B|e1)P(B|e2) where B is our belief
|
||
and (e1, e2) are evidence.
|
||
(e) In Na¨ıve Bayes, given P(B = b), P(e1|B = b), and P(e2|B = b) for each possible belief
|
||
b, we can compute P(B = b′|e1, e2) for any b′.
|
||
(f) K-Nearest Neighbors Classifier CANNOT handle data with categorical features since
|
||
it is difficult to find the distance between categorical features.
|
||
(g) In K-Nearest Neighbors for binary classification, odd values of k are usually preferred.
|
||
(h) A 6-fold cross validation for K-nearest neighbors algorithm means that for each value
|
||
of K, we randomly select 1/6 of the training data as the validation set to evaluate the
|
||
model which is trained by the remaining (5/6) of the training data.
|
||
(i) The result of the K-Means Clustering DOES NOT depend on the initial centroids.
|
||
(j) It is possible that after new cluster centroids are computed by the K-Means Clustering
|
||
Algorithm, a cluster centroid may be associated with an empty cluster (i.e., with zero
|
||
points in it).', 5, 3, NULL::jsonb, NULL, NULL, 'Problem 1 [5 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 0.5 point for each correct answer.
|
||
Question
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)
|
||
Answer
|
||
(a) In the following code
|
||
import numpy as np
|
||
a = np.array([0, 1, 1, 2, 5, 5, 3])
|
||
b = np.array([0, 1, 2, 3, 4, 5])
|
||
c = (b == a.reshape(7, 1))
|
||
The array c has the shape (7, 1).
|
||
(b) After executing the following block of code:
|
||
import numpy as np
|
||
a = np.array([[1, 2], [3, 4], [5, 6]])
|
||
b = np.array([[1, 2, 3], [0, 0, 0], [1, 0, 0]])
|
||
c = a.dot(b)
|
||
The array c is array([[22, 28], [0, 0], [1, 2]]).
|
||
(c) The Na¨ıve Bayes Classifier operates under the assumption that the presence of a partic-
|
||
ular feature in a class is independent of the presence of any other feature.
|
||
(d) In Na¨ıve Bayes, we assume that P(B|e1, e2) = P(B|e1)P(B|e2) where B is our belief
|
||
and (e1, e2) are evidence.
|
||
(e) In Na¨ıve Bayes, given P(B = b), P(e1|B = b), and P(e2|B = b) for each possible belief
|
||
b, we can compute P(B = b′|e1, e2) for any b′.
|
||
(f) K-Nearest Neighbors Classifier CANNOT handle data with categorical features since
|
||
it is difficult to find the distance between categorical features.
|
||
(g) In K-Nearest Neighbors for binary classification, odd values of k are usually preferred.
|
||
(h) A 6-fold cross validation for K-nearest neighbors algorithm means that for each value
|
||
of K, we randomly select 1/6 of the training data as the validation set to evaluate the
|
||
model which is trained by the remaining (5/6) of the training data.
|
||
(i) The result of the K-Means Clustering DOES NOT depend on the initial centroids.
|
||
(j) It is possible that after new cluster centroids are computed by the K-Means Clustering
|
||
Algorithm, a cluster centroid may be associated with an empty cluster (i.e., with zero
|
||
points in it).
|
||
Question
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)
|
||
Answer
|
||
F
|
||
F
|
||
T
|
||
F
|
||
T
|
||
F
|
||
T
|
||
F
|
||
F
|
||
T', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'easy', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [19 points] Advanced Python for Artificial Intelligence
|
||
(a)
|
||
[13 points] Consider the following NumPy arrays:
|
||
import numpy as np
|
||
# np.arange(start, stop)
|
||
# - return an array of evenly spaced values within the half-open interval
|
||
#
|
||
[start,stop), the default step size is 1.
|
||
# np.ones(shape):
|
||
# - return a new array of given shape, filled with ones.
|
||
A = np.arange(5,15)
|
||
B = np.ones((3,2))
|
||
C = np.array([[[0,1,2,3],
|
||
[4,5,6,7]],
|
||
[[8,9,10,11],
|
||
[12,13,14,15]],
|
||
[[16,17,18,19],
|
||
[20,21,22,23]]])
|
||
Suppose the following Python statements are running consecutively. Write the output
|
||
for each of the following Python statements.
|
||
If the output is an empty array, write
|
||
“Empty Array”. If an error occurs, write “Error”.
|
||
(i) print(A[2:-2:3])
|
||
(ii) # np.ndarray.reshape(shape)
|
||
# - return an array containing the same data with a new shape.
|
||
print(B.reshape(3,-1,2))
|
||
(iii) Please write a line of Python code to get the same result in part (a)(ii) by using
|
||
np.expand dims on B.
|
||
# np.expand_dims(a, axis)
|
||
# - insert a new axis to a that will appear at the axis position in the
|
||
#
|
||
expanded array shape. Return an array that is the view of a with the
|
||
#
|
||
number of dimensions increased.
|
||
(iv) Please write a line of Python code to create the array C by using the functions
|
||
np.ndarray.reshape and np.arange().
|
||
(v) # np.mean(a, axis)
|
||
# - return a new array containing the average values of a over the specified axis.
|
||
print(np.mean(C,axis = 2))
|
||
(vi) # np.transpose(a, axes)
|
||
# - return an array with axes transposed in a.
|
||
print(np.transpose(C, (1, 0, 2)))
|
||
(vii) # a@b
|
||
# - return the matrix multiplication of the two arrays a and b.
|
||
D = A.reshape(2, 5)
|
||
print(B@D)
|
||
(viii) # np.dot(a,b)
|
||
# - return the dot product of the two arrays a and b. If both a and b are
|
||
#
|
||
1-D arrays, it is the inner product of vectors and returns a scalar.
|
||
#
|
||
If both a and b are bool arrays, the output is in bool datatype.
|
||
# np.ndarray.astype(dtype)
|
||
# - return the copy of the array with a specified dtype.
|
||
E = A < 12
|
||
F = A % 5 == 0
|
||
print(np.dot(E, F).astype(int))
|
||
(ix) print(np.dot(E, F.astype(int)))
|
||
(x) print( C / B )
|
||
(xi) # np.newaxis
|
||
# - increase the dimension of an array by adding new axis.
|
||
print(C / B[..., np.newaxis])
|
||
(xii) G = C[0, :, 2:]
|
||
print(G)
|
||
(xiii) C[0, 1, 2] = 100
|
||
print(G)
|
||
Scheme:
|
||
1 point for giving the correct answer for each part. 13 points in total.
|
||
(b)
|
||
[6 points] In recommendation systems, we often recommend items similar to what the
|
||
user likes. This is known as content-based filtering. In content-based filtering, an item is
|
||
represented as a feature vector (or a 1D array). Given the following code which computes
|
||
the cosine similarity between feature vectors of items in explicit loops:
|
||
def compute_cosine_similarity_loops(X):
|
||
num_items, num_features = X.shape
|
||
# --- BLOCK TO REWRITE ---
|
||
X_normalized = np.zeros((num_items, num_features))
|
||
for i in range(num_items):
|
||
feature_norm = np.sqrt(np.sum(X[i] ** 2))
|
||
X_normalized[i] = X[i] / feature_norm
|
||
similarities = np.zeros((num_items, num_items))
|
||
for i in range(num_items):
|
||
for j in range(num_items):
|
||
similarities[i, j] = np.sum(X_normalized[i] * X_normalized[j])
|
||
# --- BLOCK TO REWRITE ---
|
||
return similarities
|
||
X = np.array([[0, 2], [1, -1], [1, 1]])
|
||
print(compute_cosine_similarity_loops(X))
|
||
# Output:
|
||
# [[ 1.
|
||
-0.70710678
|
||
0.70710678]
|
||
#
|
||
[-0.70710678
|
||
1.
|
||
0.
|
||
]
|
||
#
|
||
[ 0.70710678
|
||
0.
|
||
1.
|
||
]]
|
||
Rewrite the block of code between the comment lines “# --- BLOCK TO REWRITE --- ”
|
||
using no explicit loops in the space provided. You may find the following functions
|
||
useful for this question.
|
||
Element-wise square of an array:
|
||
np.square(A)
|
||
– A is the input array
|
||
This is equivalent to A ** 2.
|
||
Element-wise square root of an array:
|
||
np.sqrt(A)
|
||
– A is the input array
|
||
Sum of array elements over a given axis:
|
||
np.sum(A, axis)
|
||
– A is the input array
|
||
– axis is the axis across which the array is summed
|
||
Insert a new axis of size 1 to an array:
|
||
np.expand_dims(A, axis)
|
||
– A is the input array
|
||
– axis is the position where the axis is to be inserted
|
||
If axis is 0, this is equivalent to A[np.newaxis] and A[None]. If axis is 1, this is
|
||
equivalent to A[:, np.newaxis] and A[:, None].
|
||
Transpose of an array:
|
||
np.transpose(A)
|
||
– A is the input array
|
||
This is equivalent to A.T.
|
||
Matrix multiplication:
|
||
np.matmul(A, B)
|
||
– A is the left array for matrix multiplication
|
||
– B is the right array for matrix multiplication
|
||
This is equivalent to A @ B.
|
||
More information on matrix multiplication: suppose A.shape[1] == B.shape[0],
|
||
then
|
||
C = np.matmul(A, B)
|
||
means that for each i and j,
|
||
C[i, j] == np.sum(A[i] * B[:, j])
|
||
Write your code in the space below.', 19, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [19 points] Advanced Python for Artificial Intelligence
|
||
(a)
|
||
[13 points] Consider the following NumPy arrays:
|
||
import numpy as np
|
||
# np.arange(start, stop)
|
||
# - return an array of evenly spaced values within the half-open interval
|
||
#
|
||
[start,stop), the default step size is 1.
|
||
# np.ones(shape):
|
||
# - return a new array of given shape, filled with ones.
|
||
A = np.arange(5,15)
|
||
B = np.ones((3,2))
|
||
C = np.array([[[0,1,2,3],
|
||
[4,5,6,7]],
|
||
[[8,9,10,11],
|
||
[12,13,14,15]],
|
||
[[16,17,18,19],
|
||
[20,21,22,23]]])
|
||
Suppose the following Python statements are running consecutively. Write the output
|
||
for each of the following Python statements.
|
||
If the output is an empty array, write
|
||
“Empty Array”. If an error occurs, write “Error”.
|
||
(i) print(A[2:-2:3])
|
||
Answer:
|
||
[7 10]
|
||
(ii) # np.ndarray.reshape(shape)
|
||
# - return an array containing the same data with a new shape.
|
||
print(B.reshape(3,-1,2))
|
||
Answer:
|
||
[[[1 1]]
|
||
[[1 1]]
|
||
[[1 1]]]
|
||
(iii) Please write a line of Python code to get the same result in part (a)(ii) by using
|
||
np.expand dims on B.
|
||
# np.expand_dims(a, axis)
|
||
# - insert a new axis to a that will appear at the axis position in the
|
||
#
|
||
expanded array shape. Return an array that is the view of a with the
|
||
#
|
||
number of dimensions increased.
|
||
Answer:
|
||
np.expand dims(B, 1)
|
||
(also correct if adding print())
|
||
(iv) Please write a line of Python code to create the array C by using the functions
|
||
np.ndarray.reshape and np.arange().
|
||
Answer:
|
||
np.arange(24).reshape(3, 2, 4)
|
||
(also correct if adding C = )
|
||
(v) # np.mean(a, axis)
|
||
# - return a new array containing the average values of a over the specified axis.
|
||
print(np.mean(C,axis = 2))
|
||
Answer:
|
||
[[1.5 5.5]
|
||
[9.5 13.5]
|
||
[17.5 21.5]]
|
||
(vi) # np.transpose(a, axes)
|
||
# - return an array with axes transposed in a.
|
||
print(np.transpose(C, (1, 0, 2)))
|
||
Answer:
|
||
[[[0 1 2 3]
|
||
[8 9 10 11]
|
||
[16 17 18 19]]
|
||
[[4 5 6
|
||
7]
|
||
[12 13 14 15]
|
||
[20 21 22 23]]]
|
||
(vii) # a@b
|
||
# - return the matrix multiplication of the two arrays a and b.
|
||
D = A.reshape(2, 5)
|
||
print(B@D)
|
||
Answer:
|
||
[[15 17 19 21 23]
|
||
[15 17 19 21 23]
|
||
[15 17 19 21 23]]
|
||
(viii) # np.dot(a,b)
|
||
# - return the dot product of the two arrays a and b. If both a and b are
|
||
#
|
||
1-D arrays, it is the inner product of vectors and returns a scalar.
|
||
#
|
||
If both a and b are bool arrays, the output is in bool datatype.
|
||
# np.ndarray.astype(dtype)
|
||
# - return the copy of the array with a specified dtype.
|
||
E = A < 12
|
||
F = A % 5 == 0
|
||
print(np.dot(E, F).astype(int))
|
||
Answer:
|
||
(ix) print(np.dot(E, F.astype(int)))
|
||
Answer:
|
||
(x) print( C / B )
|
||
Answer:
|
||
Error
|
||
(xi) # np.newaxis
|
||
# - increase the dimension of an array by adding new axis.
|
||
print(C / B[..., np.newaxis])
|
||
Answer:
|
||
[[[0,1,2,3],
|
||
[4,5,6,7],
|
||
[[8,9,10,11],
|
||
[12,13,14,15]],
|
||
[[16,17,18,19],
|
||
[20,21,22,23]]]
|
||
(xii) G = C[0, :, 2:]
|
||
print(G)
|
||
Answer:
|
||
[[2 3]
|
||
[6 7]]
|
||
(xiii) C[0, 1, 2] = 100
|
||
print(G)
|
||
Answer:
|
||
[[2 3]
|
||
[100 7]]
|
||
Scheme:
|
||
1 point for giving the correct answer for each part. 13 points in total.
|
||
(b)
|
||
[6 points] In recommendation systems, we often recommend items similar to what the
|
||
user likes. This is known as content-based filtering. In content-based filtering, an item is
|
||
represented as a feature vector (or a 1D array). Given the following code which computes
|
||
the cosine similarity between feature vectors of items in explicit loops:
|
||
def compute_cosine_similarity_loops(X):
|
||
num_items, num_features = X.shape
|
||
# --- BLOCK TO REWRITE ---
|
||
X_normalized = np.zeros((num_items, num_features))
|
||
for i in range(num_items):
|
||
feature_norm = np.sqrt(np.sum(X[i] ** 2))
|
||
X_normalized[i] = X[i] / feature_norm
|
||
similarities = np.zeros((num_items, num_items))
|
||
for i in range(num_items):
|
||
for j in range(num_items):
|
||
similarities[i, j] = np.sum(X_normalized[i] * X_normalized[j])
|
||
# --- BLOCK TO REWRITE ---
|
||
return similarities
|
||
X = np.array([[0, 2], [1, -1], [1, 1]])
|
||
print(compute_cosine_similarity_loops(X))
|
||
# Output:
|
||
# [[ 1.
|
||
-0.70710678
|
||
0.70710678]
|
||
#
|
||
[-0.70710678
|
||
1.
|
||
0.
|
||
]
|
||
#
|
||
[ 0.70710678
|
||
0.
|
||
1.
|
||
]]
|
||
Rewrite the block of code between the comment lines “# --- BLOCK TO REWRITE --- ”
|
||
using no explicit loops in the space provided. You may find the following functions
|
||
useful for this question.
|
||
Element-wise square of an array:
|
||
np.square(A)
|
||
– A is the input array
|
||
This is equivalent to A ** 2.
|
||
Element-wise square root of an array:
|
||
np.sqrt(A)
|
||
– A is the input array
|
||
Sum of array elements over a given axis:
|
||
np.sum(A, axis)
|
||
– A is the input array
|
||
– axis is the axis across which the array is summed
|
||
Insert a new axis of size 1 to an array:
|
||
np.expand_dims(A, axis)
|
||
– A is the input array
|
||
– axis is the position where the axis is to be inserted
|
||
If axis is 0, this is equivalent to A[np.newaxis] and A[None]. If axis is 1, this is
|
||
equivalent to A[:, np.newaxis] and A[:, None].
|
||
Transpose of an array:
|
||
np.transpose(A)
|
||
– A is the input array
|
||
This is equivalent to A.T.
|
||
Matrix multiplication:
|
||
np.matmul(A, B)
|
||
– A is the left array for matrix multiplication
|
||
– B is the right array for matrix multiplication
|
||
This is equivalent to A @ B.
|
||
More information on matrix multiplication: suppose A.shape[1] == B.shape[0],
|
||
then
|
||
C = np.matmul(A, B)
|
||
means that for each i and j,
|
||
C[i, j] == np.sum(A[i] * B[:, j])
|
||
Write your code in the space below.
|
||
Answer:
|
||
X_norm = np.sqrt(np.sum(X ** 2, 1))
|
||
# (n,)
|
||
# 2 points
|
||
X_normalized = X / X_norm[:, None]
|
||
# (n, d) # 2 points
|
||
similarities = X_normalized @ X_normalized.T
|
||
# (n, n) # 2 points
|
||
The solution is not unique.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '3', NULL, 3, 'long_question', 'coding', 'Problem 3 [16 points] Model Evaluation and Advanced Python Programming
|
||
In this problem, you need to implement the evaluation metrics for multi-class classifiers.
|
||
Specifically, you need to implement the confusion matrix, accuracy, precision, recall, and
|
||
macro F1 score. We provide the related definitions and formulas as follows.
|
||
(a)
|
||
[5.5 points] Suppose there is a test dataset consisting of 10 data points, their actual
|
||
classes are [2, 1, 1, 2, 0, 1, 0, 0, 1, 1], and their predicted classes by a classifier model
|
||
are [2, 1, 2, 1, 0, 2, 1, 0, 0, 0]. What is the confusion matrix for the prediction results?
|
||
What are TP, TN, FP, FN for each class?
|
||
Confusion matrix: a table that summarized actual labels and the predictions of classifi-
|
||
cation. For multi-class classification, the confusion matrix has the shape (num classes,
|
||
num classes) that records the number of occurrences between actual labels and the pre-
|
||
dictions. The classes are listed in the same order in the rows as in the columns, therefore
|
||
the correctly classified elements are located on the main diagonal.
|
||
For each class i, TPi, TNi, FPi, FNi represent the instance numbers of true positive,
|
||
true negative, false positive, and false negative, respectively.
|
||
True positive: A test result where the classifier correctly predicts the positive class
|
||
as positive.
|
||
True negative: A test result where the classifier correctly predicts the negative class
|
||
as negative.
|
||
False positive: A test result where the classifier incorrectly predicts the negative class
|
||
as positive.
|
||
False negative: A test result where the classifier incorrectly predicts the positive class
|
||
as negative.
|
||
Please fill in the confusion matrix (the rows represent actual class and the columns
|
||
represent predicted class) and the TP, TN, FP, FN table.
|
||
Predicted
|
||
Actual
|
||
Class
|
||
Class
|
||
TP
|
||
TN
|
||
FP
|
||
FN
|
||
(b)
|
||
[1 point] What is the accuracy score of the classifier model on the above test data?
|
||
Accuracy =
|
||
PN
|
||
i=1 TPi
|
||
num testdata, where N is the number of classes.
|
||
(c)
|
||
[2.5 points] What are the precisions, recalls, and F1 scores for each class of the classifier
|
||
model on the above test data? Please fill in the table using fractions or keep 3 decimals.
|
||
For each class i, Precisioni =
|
||
TPi
|
||
TPi+FPi
|
||
For each class i, Recalli =
|
||
TPi
|
||
TPi+FNi
|
||
For each class i, F1i = 2·Precisioni·Recalli
|
||
Precisioni+Recalli
|
||
Class
|
||
Precision
|
||
Recall
|
||
F1 score
|
||
(d)
|
||
[1 point] What is the macro F1 score of the classifier model on the above test data?
|
||
Please keep 3 decimals in your answer.
|
||
The macro F1 score is the unweighted mean of the F1 scores of all classes:
|
||
Macro-F1 =
|
||
PN
|
||
i=1 F1i
|
||
N
|
||
, where N is the number of classes.
|
||
(e) [6 points] Given two NumPy 1D arrays with the same shape (num_testdata,): test_actual
|
||
and test_predict, representing the actual class labels and predicted class labels for the
|
||
test data, please implement the following functions by filling in the blanks. For each
|
||
TODO, please use a one-line Python expression.
|
||
import numpy as np
|
||
def generate_confusion_matrix(test_actual, test_predict):
|
||
# TODO 1: Get num_classes, the number of classes in the test data.
|
||
# Note that the classes in the test_actual and test_predict are represented
|
||
# in integer indices from [0, 1, ..., num_classes - 1].
|
||
num_classes = _________________________________________
|
||
confusion_matrix = np.zeros((num_classes, num_classes))
|
||
# TODO 2: Get the values of confusion_matrix, where the rows represent
|
||
# actual class and the columns represent predicted class.
|
||
for i in range(0, num_classes):
|
||
for j in range(0, num_classes):
|
||
confusion_matrix[i, j] = _____________________________________
|
||
return confusion_matrix
|
||
def calculate_evaluation_metrics(test_actual, test_predict):
|
||
confusion_matrix = generate_confusion_matrix(test_actual, test_predict)
|
||
# TODO 3: Get the accuracy score, which is a scalar value.
|
||
accuracy = ______________________________________________________
|
||
# TODO 4: Get the precisions for all classes, which is a 1D array
|
||
# with shape (num_classes, ).
|
||
precision = _____________________________________________________
|
||
# TODO 5: Get the recalls for all classes, which is a 1D array
|
||
# with shape (num_classes, ).
|
||
recall = ________________________________________________________
|
||
# TODO 6: Get the macro F1 score, which is a scalar value.
|
||
macro_f1 = ______________________________________________________
|
||
return accuracy, precision, recall, macro_f1
|
||
Note:
|
||
An expression is a combination of values, variables, operators, and calls to functions.
|
||
There must be no explicit loops in your expression.
|
||
Your implemented functions should work with any number of test data points and
|
||
any number of classes.
|
||
You cannot use any variable that is not defined inside the function or any global
|
||
variable.
|
||
You may find the following attribute or functions useful for this problem.
|
||
np.max(a, axis = None)
|
||
- return the maximum of the array a along the given axis. If axis is None, the result
|
||
is a scalar value.
|
||
np.ndarray.sum(axis = None)
|
||
- return the sum of the array over the given axis. If axis is None, the result is a
|
||
scalar value.
|
||
np.ndarray.diagonal()
|
||
- if the array is 2D, then a 1D array containing the diagonal elements is returned.
|
||
np.mean(a, axis)
|
||
- return a new array containing the average values of a over the specified axis. If
|
||
axis is None, the result is a scalar value.
|
||
Write your code in the space below.', 16, 10, NULL::jsonb, NULL, NULL, 'Problem 3 [16 points] Model Evaluation and Advanced Python Programming
|
||
In this problem, you need to implement the evaluation metrics for multi-class classifiers.
|
||
Specifically, you need to implement the confusion matrix, accuracy, precision, recall, and
|
||
macro F1 score. We provide the related definitions and formulas as follows.
|
||
(a)
|
||
[5.5 points] Suppose there is a test dataset consisting of 10 data points, their actual
|
||
classes are [2, 1, 1, 2, 0, 1, 0, 0, 1, 1], and their predicted classes by a classifier model
|
||
are [2, 1, 2, 1, 0, 2, 1, 0, 0, 0]. What is the confusion matrix for the prediction results?
|
||
What are TP, TN, FP, FN for each class?
|
||
Confusion matrix: a table that summarized actual labels and the predictions of classifi-
|
||
cation. For multi-class classification, the confusion matrix has the shape (num classes,
|
||
num classes) that records the number of occurrences between actual labels and the pre-
|
||
dictions. The classes are listed in the same order in the rows as in the columns, therefore
|
||
the correctly classified elements are located on the main diagonal.
|
||
For each class i, TPi, TNi, FPi, FNi represent the instance numbers of true positive,
|
||
true negative, false positive, and false negative, respectively.
|
||
True positive: A test result where the classifier correctly predicts the positive class
|
||
as positive.
|
||
True negative: A test result where the classifier correctly predicts the negative class
|
||
as negative.
|
||
False positive: A test result where the classifier incorrectly predicts the negative class
|
||
as positive.
|
||
False negative: A test result where the classifier incorrectly predicts the positive class
|
||
as negative.
|
||
Please fill in the confusion matrix (the rows represent actual class and the columns
|
||
represent predicted class) and the TP, TN, FP, FN table.
|
||
Predicted
|
||
Actual
|
||
Class
|
||
Class
|
||
TP
|
||
TN
|
||
FP
|
||
FN
|
||
Answer:
|
||
Predicted
|
||
Actual
|
||
Class
|
||
Class
|
||
TP
|
||
TN
|
||
FP
|
||
FN
|
||
Scheme:
|
||
0.25 point for each correct numeric value. 5.25 points in total.
|
||
An extra 0.25 point is given for those who gave all the correct numeric values.
|
||
(b)
|
||
[1 point] What is the accuracy score of the classifier model on the above test data?
|
||
Accuracy =
|
||
PN
|
||
i=1 TPi
|
||
num testdata, where N is the number of classes.
|
||
Answer:
|
||
0.4
|
||
Scheme:
|
||
1 point for the correct answer.
|
||
(c)
|
||
[2.5 points] What are the precisions, recalls, and F1 scores for each class of the classifier
|
||
model on the above test data? Please fill in the table using fractions or keep 3 decimals.
|
||
For each class i, Precisioni =
|
||
TPi
|
||
TPi+FPi
|
||
For each class i, Recalli =
|
||
TPi
|
||
TPi+FNi
|
||
For each class i, F1i = 2·Precisioni·Recalli
|
||
Precisioni+Recalli
|
||
Class
|
||
Precision
|
||
Recall
|
||
F1 score
|
||
Answer:
|
||
Class
|
||
Precision
|
||
Recall
|
||
F1 score
|
||
1/2
|
||
2/3
|
||
4/7
|
||
1/3
|
||
1/5
|
||
1/4
|
||
1/3
|
||
1/2
|
||
2/5
|
||
Scheme:
|
||
0.25 point for each correct numeric value (or fraction). 2.25 points in total.
|
||
An extra 0.25 point is given for those who gave all the correct numeric values (or
|
||
fractions).
|
||
(d)
|
||
[1 point] What is the macro F1 score of the classifier model on the above test data?
|
||
Please keep 3 decimals in your answer.
|
||
The macro F1 score is the unweighted mean of the F1 scores of all classes:
|
||
Macro-F1 =
|
||
PN
|
||
i=1 F1i
|
||
N
|
||
, where N is the number of classes.
|
||
Answer:
|
||
0.407
|
||
Scheme:
|
||
1 point for the correct answer.
|
||
(e) [6 points] Given two NumPy 1D arrays with the same shape (num_testdata,): test_actual
|
||
and test_predict, representing the actual class labels and predicted class labels for the
|
||
test data, please implement the following functions by filling in the blanks. For each
|
||
TODO, please use a one-line Python expression.
|
||
import numpy as np
|
||
def generate_confusion_matrix(test_actual, test_predict):
|
||
# TODO 1: Get num_classes, the number of classes in the test data.
|
||
# Note that the classes in the test_actual and test_predict are represented
|
||
# in integer indices from [0, 1, ..., num_classes - 1].
|
||
num_classes = _________________________________________
|
||
confusion_matrix = np.zeros((num_classes, num_classes))
|
||
# TODO 2: Get the values of confusion_matrix, where the rows represent
|
||
# actual class and the columns represent predicted class.
|
||
for i in range(0, num_classes):
|
||
for j in range(0, num_classes):
|
||
confusion_matrix[i, j] = _____________________________________
|
||
return confusion_matrix
|
||
def calculate_evaluation_metrics(test_actual, test_predict):
|
||
confusion_matrix = generate_confusion_matrix(test_actual, test_predict)
|
||
# TODO 3: Get the accuracy score, which is a scalar value.
|
||
accuracy = ______________________________________________________
|
||
# TODO 4: Get the precisions for all classes, which is a 1D array
|
||
# with shape (num_classes, ).
|
||
precision = _____________________________________________________
|
||
# TODO 5: Get the recalls for all classes, which is a 1D array
|
||
# with shape (num_classes, ).
|
||
recall = ________________________________________________________
|
||
# TODO 6: Get the macro F1 score, which is a scalar value.
|
||
macro_f1 = ______________________________________________________
|
||
return accuracy, precision, recall, macro_f1
|
||
Note:
|
||
An expression is a combination of values, variables, operators, and calls to functions.
|
||
There must be no explicit loops in your expression.
|
||
Your implemented functions should work with any number of test data points and
|
||
any number of classes.
|
||
You cannot use any variable that is not defined inside the function or any global
|
||
variable.
|
||
You may find the following attribute or functions useful for this problem.
|
||
np.max(a, axis = None)
|
||
- return the maximum of the array a along the given axis. If axis is None, the result
|
||
is a scalar value.
|
||
np.ndarray.sum(axis = None)
|
||
- return the sum of the array over the given axis. If axis is None, the result is a
|
||
scalar value.
|
||
np.ndarray.diagonal()
|
||
- if the array is 2D, then a 1D array containing the diagonal elements is returned.
|
||
np.mean(a, axis)
|
||
- return a new array containing the average values of a over the specified axis. If
|
||
axis is None, the result is a scalar value.
|
||
Write your code in the space below. Answer:
|
||
TODO 1: np.max(test actual) + 1 or np.max(test predict) + 1
|
||
TODO 2: (test actual == 1) & (test predict == j)).sum()
|
||
TODO 3: confusion matrix.diagonal().sum() / confusion matrix.sum()
|
||
TODO 4: confusion matrix.diagonal() / confusion matrix.sum(axis = 0)
|
||
TODO 5: confusion matrix.diagonal() / confusion matrix.sum(axis = 1)
|
||
TODO 6: np.mean(2 * precision * recall) / (precision + recall))
|
||
Scheme:
|
||
1 point for each TODO. 6 points in total.', ARRAY['Evaluation and Validation']::TEXT[], 'Evaluation and Validation', 'Evaluation and Validation', ARRAY['Evaluation and Validation']::TEXT[], ARRAY['metric_computation', 'experimental_design', 'reasoning']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [16 points] Na¨ıve Bayes Classifier
|
||
Based on the given training data in the table below, which includes both numerical and
|
||
categorical attributes, make a prediction about the degree classification (DC) (i.e., First-Class
|
||
Honors or Second-Class Honors D1 or Second-Class Honors D2, or Third-Class Honors) of
|
||
the student with the following attribute values using Na¨ıve Bayes classifier.
|
||
Study Attitude (SA): Serious
|
||
Part-time Job (PTJ): No
|
||
Average Energy Level (AEL): 7.2
|
||
Courses Taught by Desmond and Pearl (CDP): Yes
|
||
Number of Friends in the Study Group (NFSG): 4
|
||
Student
|
||
Study
|
||
Attitude
|
||
(SA)
|
||
Categorical
|
||
Part-time
|
||
Job
|
||
(PTJ)
|
||
Categorical
|
||
Average
|
||
Energy Level
|
||
(AEL)
|
||
Numerical
|
||
Courses
|
||
Taught by
|
||
Desmond
|
||
and Pearl
|
||
(CDP)
|
||
Categorical
|
||
Number of
|
||
Friends in the
|
||
Study Group
|
||
(NFSG)
|
||
Categorical
|
||
Degree
|
||
Classification
|
||
(DC)
|
||
Serious
|
||
No
|
||
8.5
|
||
Yes
|
||
First
|
||
Moderate
|
||
Yes
|
||
6.2
|
||
No
|
||
Second D1
|
||
Serious
|
||
No
|
||
7.8
|
||
Yes
|
||
Second D1
|
||
Moderate
|
||
Yes
|
||
5.9
|
||
No
|
||
Third
|
||
Serious
|
||
No
|
||
8.1
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
6.5
|
||
Yes
|
||
Second D2
|
||
Serious
|
||
No
|
||
7.3
|
||
No
|
||
Second D1
|
||
Serious
|
||
No
|
||
7.9
|
||
Yes
|
||
First
|
||
Moderate
|
||
Yes
|
||
5.7
|
||
No
|
||
Third
|
||
Serious
|
||
No
|
||
8.2
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
6.1
|
||
Yes
|
||
Second D2
|
||
Serious
|
||
No
|
||
7.6
|
||
No
|
||
Second D2
|
||
Moderate
|
||
Yes
|
||
6.4
|
||
Yes
|
||
Third
|
||
Serious
|
||
No
|
||
8.0
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
5.8
|
||
No
|
||
Second D2
|
||
Serious
|
||
No
|
||
7.5
|
||
Yes
|
||
Second D1
|
||
Serious
|
||
No
|
||
7.7
|
||
Yes
|
||
First
|
||
Moderate
|
||
Yes
|
||
6.0
|
||
No
|
||
Third
|
||
Serious
|
||
No
|
||
7.8
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
6.3
|
||
Yes
|
||
Second D2
|
||
Assume each categorical attribute has the following possible values:
|
||
Study Attitude: Serious, Moderate, Casual
|
||
Part-time Job: Yes, No
|
||
Courses Taught by Desmond and Pearl: Yes, No
|
||
Number of Friends in the Study Group: 1, 2, 3, 4, 5, 6
|
||
Assume that the numerical data follow a Gaussian distribution:
|
||
f(x) =
|
||
σ
|
||
√
|
||
2πexp
|
||
|
||
−(x −µ)2
|
||
2σ2
|
||
|
||
where
|
||
numerical training data = (x1, x2, . . . , xn)
|
||
µ = 1
|
||
n
|
||
n
|
||
X
|
||
i=1
|
||
xi,
|
||
σ =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n −1
|
||
n
|
||
X
|
||
i=1
|
||
(xi −µ)2
|
||
If needed, apply 1-Laplace Smoothing to the likelihood probabilities of the affected feature
|
||
only. The affected feature means that the categorical feature has a category given some
|
||
belief in the test dataset, which was NOT observed in the training dataset. Please provide
|
||
all the steps.
|
||
You may find the following equation useful for this question:
|
||
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) · · · P(ed|Bi))
|
||
.(a)
|
||
[4 points] Calculate the mean (µ) and standard deviation (σ) of the Average Energy
|
||
Level (AEL) of Degree Classification: First-Class Honors and Second-Class Honors D1.
|
||
(b)
|
||
[3 points] Calculate the test data sample’s likelihoods of Average Energy Level (AEL)
|
||
of First-Class Honors and Second-Class Honors D1.
|
||
(c) [4 points] Calculate the test data sample’s likelihoods of Study Attitude (SA), Part-time
|
||
Job (PTJ), Courses Taught by Desmond and Pearl (CDP), and Number of Friends in
|
||
the Study Group (NFSG) of all Degree Classification(s) (DC).
|
||
(d)
|
||
[2 points] Calculate the prior probabilities.
|
||
(e)
|
||
[3 points] Finally, calculate the posterior probabilities and make the prediction.
|
||
Assume that the likelihood of Average Energy Level (AEL) of Second-Class Honors D2
|
||
and Third-Class Honors are:
|
||
P(AEL=7.2|Second-Class Honors D2) = 0.32515
|
||
P(AEL=7.2|Third-Class Honors) = 0.000334158', 16, 14, NULL::jsonb, NULL, NULL, 'Problem 4 [16 points] Na¨ıve Bayes Classifier
|
||
Based on the given training data in the table below, which includes both numerical and
|
||
categorical attributes, make a prediction about the degree classification (DC) (i.e., First-Class
|
||
Honors or Second-Class Honors D1 or Second-Class Honors D2, or Third-Class Honors) of
|
||
the student with the following attribute values using Na¨ıve Bayes classifier.
|
||
Study Attitude (SA): Serious
|
||
Part-time Job (PTJ): No
|
||
Average Energy Level (AEL): 7.2
|
||
Courses Taught by Desmond and Pearl (CDP): Yes
|
||
Number of Friends in the Study Group (NFSG): 4
|
||
Student
|
||
Study
|
||
Attitude
|
||
(SA)
|
||
Categorical
|
||
Part-time
|
||
Job
|
||
(PTJ)
|
||
Categorical
|
||
Average
|
||
Energy Level
|
||
(AEL)
|
||
Numerical
|
||
Courses
|
||
Taught by
|
||
Desmond
|
||
and Pearl
|
||
(CDP)
|
||
Categorical
|
||
Number of
|
||
Friends in the
|
||
Study Group
|
||
(NFSG)
|
||
Categorical
|
||
Degree
|
||
Classification
|
||
(DC)
|
||
Serious
|
||
No
|
||
8.5
|
||
Yes
|
||
First
|
||
Moderate
|
||
Yes
|
||
6.2
|
||
No
|
||
Second D1
|
||
Serious
|
||
No
|
||
7.8
|
||
Yes
|
||
Second D1
|
||
Moderate
|
||
Yes
|
||
5.9
|
||
No
|
||
Third
|
||
Serious
|
||
No
|
||
8.1
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
6.5
|
||
Yes
|
||
Second D2
|
||
Serious
|
||
No
|
||
7.3
|
||
No
|
||
Second D1
|
||
Serious
|
||
No
|
||
7.9
|
||
Yes
|
||
First
|
||
Moderate
|
||
Yes
|
||
5.7
|
||
No
|
||
Third
|
||
Serious
|
||
No
|
||
8.2
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
6.1
|
||
Yes
|
||
Second D2
|
||
Serious
|
||
No
|
||
7.6
|
||
No
|
||
Second D2
|
||
Moderate
|
||
Yes
|
||
6.4
|
||
Yes
|
||
Third
|
||
Serious
|
||
No
|
||
8.0
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
5.8
|
||
No
|
||
Second D2
|
||
Serious
|
||
No
|
||
7.5
|
||
Yes
|
||
Second D1
|
||
Serious
|
||
No
|
||
7.7
|
||
Yes
|
||
First
|
||
Moderate
|
||
Yes
|
||
6.0
|
||
No
|
||
Third
|
||
Serious
|
||
No
|
||
7.8
|
||
Yes
|
||
First
|
||
Casual
|
||
Yes
|
||
6.3
|
||
Yes
|
||
Second D2
|
||
Assume each categorical attribute has the following possible values:
|
||
Study Attitude: Serious, Moderate, Casual
|
||
Part-time Job: Yes, No
|
||
Courses Taught by Desmond and Pearl: Yes, No
|
||
Number of Friends in the Study Group: 1, 2, 3, 4, 5, 6
|
||
Assume that the numerical data follow a Gaussian distribution:
|
||
f(x) =
|
||
σ
|
||
√
|
||
2πexp
|
||
|
||
−(x −µ)2
|
||
2σ2
|
||
|
||
where
|
||
numerical training data = (x1, x2, . . . , xn)
|
||
µ = 1
|
||
n
|
||
n
|
||
X
|
||
i=1
|
||
xi,
|
||
σ =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n −1
|
||
n
|
||
X
|
||
i=1
|
||
(xi −µ)2
|
||
If needed, apply 1-Laplace Smoothing to the likelihood probabilities of the affected feature
|
||
only. The affected feature means that the categorical feature has a category given some
|
||
belief in the test dataset, which was NOT observed in the training dataset. Please provide
|
||
all the steps.
|
||
You may find the following equation useful for this question:
|
||
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) · · · P(ed|Bi))
|
||
.(a)
|
||
[4 points] Calculate the mean (µ) and standard deviation (σ) of the Average Energy
|
||
Level (AEL) of Degree Classification: First-Class Honors and Second-Class Honors D1.
|
||
Answer:
|
||
Mean of average energy level given first-class honors = 8.02857
|
||
Standard deviation of average energy level given first-class honors = 0.26904
|
||
Mean of average energy level given second-class honors, D1 = 7.2
|
||
Standard deviation of average energy level given second-class honors, D1 = 0.69761
|
||
Scheme:
|
||
1 point for each correct mean value. 2 points in total.
|
||
1 point for each correct standard deviation value. 2 points in totals.
|
||
(b)
|
||
[3 points] Calculate the test data sample’s likelihoods of Average Energy Level (AEL)
|
||
of First-Class Honors and Second-Class Honors D1.
|
||
Answer:
|
||
P(AEL = 7.2|First) =
|
||
0.26904
|
||
√
|
||
2πexp
|
||
|
||
−(7.2 −8.02857)2
|
||
2(0.26904)2
|
||
|
||
= 0.01293
|
||
P(AEL = 7.2|SecondD1) =
|
||
0.69761
|
||
√
|
||
2πexp
|
||
|
||
−(7.2 −7.2)2
|
||
2(0.69761)2
|
||
|
||
= 0.57187
|
||
Scheme:
|
||
1.5 points for each correct test data sample’s likelihood. 3 points in total.
|
||
(c) [4 points] Calculate the test data sample’s likelihoods of Study Attitude (SA), Part-time
|
||
Job (PTJ), Courses Taught by Desmond and Pearl (CDP), and Number of Friends in
|
||
the Study Group (NFSG) of all Degree Classification(s) (DC).
|
||
Answer:
|
||
As P(SA = Serious|Third) = 0, we need to apply a add-one-trick for the study
|
||
attitude. We assume that three values (Serious, Moderate, Casual) are equally prob-
|
||
able:
|
||
– P(SA = Serious|First) = 7+1
|
||
7+3 = 0.8
|
||
– P(SA = Serious|SecondD1) = 3+1
|
||
4+3 = 0.57
|
||
– P(SA = Serious|SecondD2) = 1+1
|
||
5+3 = 0.25
|
||
– P(SA = Serious|Third) = 0+1
|
||
4+3 = 0.1429
|
||
As P(PTJ = No|Third) = 0, we need to apply a add-one-trick for the Part-time
|
||
Job. We assume that three values (Yes, No) are equally probable:
|
||
– P(PTJ = No|First) = 7+1
|
||
7+2 = 0.89
|
||
– P(PTJ = No|SecondD1) = 3+1
|
||
4+2 = 0.67
|
||
– P(PTJ = No|SecondD2) = 1+1
|
||
5+2 = 0.286
|
||
– P(PTJ = No|Third) = 0+1
|
||
4+2 = 0.167
|
||
P(CDP = Y es|First) = 7
|
||
7 = 1
|
||
P(CDP = Y es|SecondD1) = 2
|
||
4 = 0.5
|
||
P(CDP = Y es|SecondD2) = 3
|
||
5 = 0.6
|
||
P(CDP = Y es|Third) = 1
|
||
4 = 0.25
|
||
As P(NFSG = 4|First) = 0, we need to apply a add-one-trick for the Number
|
||
of friends in the Study Group. We assume that six values (1,2,3,4,5,6) are equally
|
||
probable:
|
||
– P(NFSG = 4|First) = 0+1
|
||
7+6 = 0.0769
|
||
– P(NFSG = 4|SecondD1) = 3+1
|
||
4+6 = 0.4
|
||
– P(NFSG = 4|SecondD2) = 1+1
|
||
5+6 = 0.18
|
||
– P(NFSG = 4|Third) = 0+1
|
||
4+6 = 0.1
|
||
Scheme:
|
||
0.25 point for each correct test data sample’s likelihood. 4 points in total.
|
||
(d)
|
||
[2 points] Calculate the prior probabilities.
|
||
Answer:
|
||
P(First) = 7
|
||
20 = 0.35
|
||
P(SecondD1) = 4
|
||
20 = 0.2
|
||
P(SecondD2) = 5
|
||
20 = 0.25
|
||
P(Third) = 4
|
||
20 = 0.2
|
||
Scheme:
|
||
0.5 point for each correct prior probability. 2 points in total.
|
||
(e)
|
||
[3 points] Finally, calculate the posterior probabilities and make the prediction.
|
||
Assume that the likelihood of Average Energy Level (AEL) of Second-Class Honors D2
|
||
and Third-Class Honors are:
|
||
P(AEL=7.2|Second-Class Honors D2) = 0.32515
|
||
P(AEL=7.2|Third-Class Honors) = 0.000334158
|
||
Answer:
|
||
P(First|E) = (0.35)(0.8)(0.89)(0.01293)(1)(0.0769)
|
||
P(E)
|
||
= 0.0002478
|
||
P(E)
|
||
P(SecondD1|E) = (0.2)(0.57)(0.67)(0.57187)(0.5)(0.4)
|
||
P(E)
|
||
= 0.0087359
|
||
P(E)
|
||
P(SecondD2|E) = (0.25)(0.25)(0.286)(0.32515)(0.6)(0.18)
|
||
P(E)
|
||
= 0.0006277
|
||
P(E)
|
||
P(Third|E) = (0.2)(0.1429)(0.167)(0.000334158)(0.25)(0.1)
|
||
P(E)
|
||
= 3.9872×10−8
|
||
P(E)
|
||
Therefore, the Na¨ıve Bayes classifier predicts “Degree Classification” = Second-Class
|
||
Honors D2 for the student.
|
||
Scheme:
|
||
0.5 point for each correct posterior probability. 2 points in total.
|
||
1 point for making the correct prediction.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 Part(b) Continued
|
||
(c)
|
||
[2 points] Based on the chosen K and those 6 training samples in part (b), calculate the
|
||
test error for the test dataset below.
|
||
Note:
|
||
When selecting a neighbor, to resolve ties, choose the neighbor with the lowest index.
|
||
When evaluating the error, use
|
||
– Number of wrong predictions / Number of test data points
|
||
.
|
||
Attribute 1
|
||
Attribute 2
|
||
Class
|
||
3.4
|
||
B
|
||
2.8
|
||
A
|
||
3.5
|
||
A
|
||
2.5
|
||
B', 18, 19, NULL::jsonb, NULL, NULL, 'Problem 5 Part(b) Continued
|
||
(c)
|
||
[2 points] Based on the chosen K and those 6 training samples in part (b), calculate the
|
||
test error for the test dataset below.
|
||
Note:
|
||
When selecting a neighbor, to resolve ties, choose the neighbor with the lowest index.
|
||
When evaluating the error, use
|
||
– Number of wrong predictions / Number of test data points
|
||
.
|
||
Attribute 1
|
||
Attribute 2
|
||
Class
|
||
3.4
|
||
B
|
||
2.8
|
||
A
|
||
3.5
|
||
A
|
||
2.5
|
||
B
|
||
Answer:
|
||
Using K=3, test error is 0.
|
||
Details:
|
||
**Test Data**
|
||
([3.4, 3.0], B) k nearest neighbors, [[3.2, 2.9], B, [2.7, 3.0], A, [3.4, 3.8], B] predict label B error: 0
|
||
([3.0, 2.8], A) k nearest neighbors, [[3.2, 2.9], B, [2.7, 3.0], A, [2.6, 2.0], A] predict label A error: 0
|
||
([2.0, 3.5], A) k nearest neighbors, [[2.5, 3.7], A, [2.7, 3.0], A, [3.4, 3.8], B] predict label A error: 0
|
||
([2.5, 7.0], B) k nearest neighbors, [[2.5, 3.7], A, [3.5, 4.0], B, [3.4, 3.8], B] predict label B error: 0
|
||
total error for test data: 0/4
|
||
Scheme:
|
||
2 points for giving the total error for 6-cross validation for K = 3.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [16 points] Leader Clustering
|
||
Consider the following cluster method called Leader Clustering. It receives two parameters:
|
||
an integer K and a real-value threshold T. Similar to K-means clustering, it starts by selecting
|
||
K instance (which will be called leaders) and assigns each training instance to the cluster of
|
||
the closest leader. During this assignment step, however, if the distance of a training instance
|
||
to its closest leader is greater than the input threshold T, then this training instance forms a
|
||
new cluster and becomes the initial leader of this new cluster. After all the training instances
|
||
have been assigned to a cluster, the leader of each cluster is updated as the mean of the
|
||
cluster. The process is then repeated until the cluster assignments do not change.
|
||
(a)
|
||
[6 points] Given a 1-dimensional data set { 1, 3, 5, 9, 11, 13, 15 }, use the Leader
|
||
Clustering algorithm and Euclidean distance to cluster the given points in the data set
|
||
into 2 clusters. Assume c1 = 3 and c2 = 11 are chosen as the initial K = 2 leaders
|
||
and the threshold for forming new clusters T = 5. Fill in the following table of the first
|
||
assignment iteration with your completed values and what are the leaders after the first
|
||
assignment iteration? (If new clusters are formed in the process, named their leaders as
|
||
c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s)
|
||
are not formed yet.)
|
||
Data
|
||
point
|
||
Distance
|
||
between the
|
||
data point and c1
|
||
Distance
|
||
between the
|
||
data point and c2
|
||
Distance
|
||
between the
|
||
data point and c3
|
||
(if needed)
|
||
Distance
|
||
between the
|
||
data point and c4
|
||
(if needed)
|
||
Closest
|
||
Centroid
|
||
The leaders after the first assignment iteration:
|
||
(b) [6 points] Given a 1-dimensional data set {5, 9, 11, 13, 17, 19}, use the Leader Clustering
|
||
algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters.
|
||
Assume c1 = 5 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for
|
||
forming new clusters T = 5. Fill in the following table of the first assignment iteration
|
||
with your computed values and what are the leaders after the first assignment iteration?
|
||
(If new clusters are formed in the process, named their leaders as c3, c4, ... based on the
|
||
order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.)
|
||
Data
|
||
point
|
||
Distance
|
||
between the
|
||
data point and c1
|
||
Distance
|
||
between the
|
||
data point and c2
|
||
Distance
|
||
between the
|
||
data point and c3
|
||
(if needed)
|
||
Distance
|
||
between the
|
||
data point and c4
|
||
(if needed)
|
||
Closest
|
||
Centroid
|
||
The leaders after the first assignment iteration:
|
||
(c)
|
||
[2 points] Which of the two methods, K-Means Clustering or Leader Clustering, will be
|
||
better at dealing with outliers? Please briefly explain.
|
||
(d)
|
||
[2 points] During lectures, we have learned that one drawback of the K-Means Cluster-
|
||
ing algorithm is that we need to specify K for the algorithm, but usually, we don’t know
|
||
how many clusters there should be for an unlabeled dataset. Will the Leader Clustering
|
||
mitigate this drawback? Will there be any related limitations of the Leader Clustering
|
||
algorithm? Please briefly explain.', 16, 21, NULL::jsonb, NULL, NULL, 'Problem 6 [16 points] Leader Clustering
|
||
Consider the following cluster method called Leader Clustering. It receives two parameters:
|
||
an integer K and a real-value threshold T. Similar to K-means clustering, it starts by selecting
|
||
K instance (which will be called leaders) and assigns each training instance to the cluster of
|
||
the closest leader. During this assignment step, however, if the distance of a training instance
|
||
to its closest leader is greater than the input threshold T, then this training instance forms a
|
||
new cluster and becomes the initial leader of this new cluster. After all the training instances
|
||
have been assigned to a cluster, the leader of each cluster is updated as the mean of the
|
||
cluster. The process is then repeated until the cluster assignments do not change.
|
||
(a)
|
||
[6 points] Given a 1-dimensional data set { 1, 3, 5, 9, 11, 13, 15 }, use the Leader
|
||
Clustering algorithm and Euclidean distance to cluster the given points in the data set
|
||
into 2 clusters. Assume c1 = 3 and c2 = 11 are chosen as the initial K = 2 leaders
|
||
and the threshold for forming new clusters T = 5. Fill in the following table of the first
|
||
assignment iteration with your completed values and what are the leaders after the first
|
||
assignment iteration? (If new clusters are formed in the process, named their leaders as
|
||
c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s)
|
||
are not formed yet.)
|
||
Data
|
||
point
|
||
Distance
|
||
between the
|
||
data point and c1
|
||
Distance
|
||
between the
|
||
data point and c2
|
||
Distance
|
||
between the
|
||
data point and c3
|
||
(if needed)
|
||
Distance
|
||
between the
|
||
data point and c4
|
||
(if needed)
|
||
Closest
|
||
Centroid
|
||
The leaders after the first assignment iteration:
|
||
Answer:
|
||
Data
|
||
point
|
||
Distance
|
||
between the
|
||
data point and c1
|
||
Distance
|
||
between the
|
||
data point and c2
|
||
Distance
|
||
between the
|
||
data point and c3
|
||
(if needed)
|
||
Distance
|
||
between the
|
||
data point and c4
|
||
(if needed)
|
||
Closest
|
||
Centroid
|
||
c1
|
||
c1
|
||
c1
|
||
c2
|
||
c2
|
||
c2
|
||
c2
|
||
After the first assignment iteration, c1 = (1+3+5)/3 = 3, c2 = (9+11+13+15)/4 = 12.
|
||
Scheme:
|
||
0.25 point for each correct numeric value (or label). 5.25 points in total.
|
||
0.25 point for the correct values of the leaders.
|
||
If the values of both leader are correct, 0.75. If only one leader is correct,
|
||
0.25.
|
||
(b) [6 points] Given a 1-dimensional data set {5, 9, 11, 13, 17, 19}, use the Leader Clustering
|
||
algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters.
|
||
Assume c1 = 5 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for
|
||
forming new clusters T = 5. Fill in the following table of the first assignment iteration
|
||
with your computed values and what are the leaders after the first assignment iteration?
|
||
(If new clusters are formed in the process, named their leaders as c3, c4, ... based on the
|
||
order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.)
|
||
Data
|
||
point
|
||
Distance
|
||
between the
|
||
data point and c1
|
||
Distance
|
||
between the
|
||
data point and c2
|
||
Distance
|
||
between the
|
||
data point and c3
|
||
(if needed)
|
||
Distance
|
||
between the
|
||
data point and c4
|
||
(if needed)
|
||
Closest
|
||
Centroid
|
||
The leaders after the first assignment iteration:
|
||
Answer:
|
||
Data
|
||
point
|
||
Distance
|
||
between the
|
||
data point and c1
|
||
Distance
|
||
between the
|
||
data point and c2
|
||
Distance
|
||
between the
|
||
data point and c3
|
||
(if needed)
|
||
Distance
|
||
between the
|
||
data point and c4
|
||
(if needed)
|
||
Closest
|
||
Centroid
|
||
c1
|
||
c2
|
||
c2
|
||
c2
|
||
c3
|
||
c3
|
||
After the first assignment iteration, c1 = 5, c2 = (9+11+13)/3 = 11, c3 = (17+19)/2 =
|
||
18.
|
||
Scheme:
|
||
0.25 point for each correct numeric value (or label). 5 points in total.
|
||
1 point for the correct values of the leaders. 1 point in total.
|
||
If there are any incorrect leaders in the answer, 0 point
|
||
(c)
|
||
[2 points] Which of the two methods, K-Means Clustering or Leader Clustering, will be
|
||
better at dealing with outliers? Please briefly explain.
|
||
Answer:
|
||
Leader clustering is more robust (better at dealing with) outliers. This is because new
|
||
cluster will be generated and outliers will be assigned to the new cluster without influ-
|
||
encing the other clusters.
|
||
Scheme:
|
||
1 point for stating which method is better at dealing with outliers.
|
||
1 point for the explanation.
|
||
(d)
|
||
[2 points] During lectures, we have learned that one drawback of the K-Means Cluster-
|
||
ing algorithm is that we need to specify K for the algorithm, but usually, we don’t know
|
||
how many clusters there should be for an unlabeled dataset. Will the Leader Clustering
|
||
mitigate this drawback? Will there be any related limitations of the Leader Clustering
|
||
algorithm? Please briefly explain.
|
||
Answer:
|
||
The Leader Clustering can mitigate this drawback of specifying K because the algorithm
|
||
may get increment on the number of clusters during training. But now the limitation is
|
||
that we have to specify T for the Leader Clustering algorithm. The threshold value will
|
||
influence whether the algorithm increases new leader (new cluster) or not. The clustering
|
||
results will be sensitive to T.
|
||
Scheme:
|
||
1 point for stating whether the Leader Clustering mitigate the drawback.
|
||
1 point for the explanation.
|
||
Explanations including:
|
||
1. More computation/complexity
|
||
2. Sensitive to the initial choice of leaders
|
||
3. Cannot control the number of clusters
|
||
will not get any point.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-midterm', '7', NULL, 7, 'long_question', 'coding', 'Problem 7 Continued:
|
||
-------------------- END OF PAPER
|
||
--------------------
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */', 10, 24, NULL::jsonb, NULL, NULL, 'Problem 7 Continued:
|
||
Answer:
|
||
D = 2, average accuracy = 0%: In this case, the dataset was divided into two folds. Since
|
||
the samples from each class were grouped together, one fold contained samples from classes
|
||
0 and 1, and the other fold contained samples from classes 2 and 3. As a result, the classifier
|
||
failed to correctly classify any of the samples because it was not trained on the classes present
|
||
in the test fold.
|
||
D = 3, average accuracy = 50%: In this case, the dataset was divided into three folds.
|
||
Fold 1 contained samples from classes 0 and 1, fold 2 contained samples from classes 1 and 2,
|
||
and fold 3 contained samples from classes 2 and 3. Using fold 1 and fold 2 as training, and
|
||
fold 3 as testing, only 25% of test points are classified correctly.
|
||
Training: Fold 1, 2, Testing: Fold 3, Accuracy = 25%
|
||
Training: Fold 1, 3, Testing: Fold 2, Accuracy = 100%
|
||
Training: Fold 2, 3, Testing: Fold 1, Accuracy = 25%
|
||
So, the average accuracy is (25% + 100% + 25%)/3 = 50%.
|
||
D = 5, accuracy = 100%: In this case, the dataset was divided into five folds.
|
||
Fold 1
|
||
contained samples from class 0, fold 2 contained samples from class 0 and 1, fold 3 contained
|
||
samples from class 1 and 2, fold 4 contained samples from class 2 and 3, and fold 5 contained
|
||
samples from class 3.
|
||
Training: Fold 1, 2, 3, 4, Testing: Fold 5, Accuracy: 100%
|
||
Training: Fold 1, 2, 3, 5, Testing: Fold 4, Accuracy: 100%
|
||
Training: Fold 1, 2, 4, 5, Testing: Fold 3, Accuracy: 100%
|
||
Training: Fold 1, 3, 4, 5, Testing: Fold 2, Accuracy: 100%
|
||
Training: Fold 2, 3, 4, 5, Testing: Fold 1, Accuracy: 100%
|
||
So, the average accuracy is (100% + 100% + 100% + 100% + 100%)/5 = 100%.
|
||
To improve the implementation of D-fold cross-validation, the dataset should be shuffled
|
||
before dividing it into folds. This will ensure that each fold contains a representative dis-
|
||
tribution of samples from all classes, reducing bias and providing more reliable evaluation
|
||
results.
|
||
Scheme:
|
||
2 points for explaining the result obtained for D = 2.
|
||
3 points for explaining the result obtained for D = 3.
|
||
3 points for explaining the result obtained for D = 5.
|
||
2 points for suggesting improvement(s) to the implementation.
|
||
-------------------- END OF PAPER
|
||
--------------------
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */', ARRAY['Evaluation and Validation']::TEXT[], 'Evaluation and Validation', 'Evaluation and Validation', ARRAY['Evaluation and Validation']::TEXT[], ARRAY['metric_computation', 'experimental_design', 'reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table in the answer book. You get 1 point for each correct answer.
|
||
(a) The console output of the following Python code is [1,2,3,4,5,6,7].
|
||
import numpy as np
|
||
# arange(start, stop): Values are generated within half-open interval [start,stop).
|
||
array = np.arange(1,10)
|
||
print(array[:-2])
|
||
(b) There is no way for the Na¨ıve Bayes classifier to make a prediction if a categorical feature
|
||
(e.g., color) has a new category (e.g., blue) not observed in the training data set.
|
||
(c) K-Nearest Neighbors classifier is a non-parametric machine learning algorithm with an
|
||
assumption that the data are uniformly distributed.
|
||
(d) It is possible for K-Means Clustering to return empty clusters if certain initial centroid
|
||
positions are unfortunate.
|
||
(e) If there are only two classes to predict, the following Multi-layer Perceptron (MLP)
|
||
models will have the same output, given that they have the same initial weights and
|
||
biases, and are trained in the same manner:
|
||
MLP 1: Input layer, hidden dense layer with ReLU activation function, output layer
|
||
with sigmoid activation function.
|
||
MLP 2: Input layer, hidden dense layer with ReLU activation function, output layer
|
||
with softmax activation function.
|
||
(f) An affine transformation may preserve distances and angles.
|
||
(g) The number of floating point multiplications involved when a 32×32 pixel RGB image
|
||
is passed through a 2D convolutional layer with 8 3×3 kernels, padding of 3, and stride
|
||
length of 1 is 8×3×3×36×36.
|
||
(h) Setting the dropout rate of a Convolutional Neural Network to 0.5 means that more than
|
||
50% of its layer’s outputs are non-zero.
|
||
(i) Alpha-beta pruning can sometimes change the final decision made by the minimax algo-
|
||
rithm, resulting in a different move being selected for the current player.
|
||
(j) Researches that involves human participants should require informed consent.', 10, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions
|
||
Indicate whether the following statements are true or false by putting T or F in the given
|
||
table. You get 1 point for each correct answer.
|
||
Question
|
||
(a)
|
||
(b)
|
||
(c)
|
||
(d)
|
||
(e)
|
||
(f)
|
||
(g)
|
||
(h)
|
||
(i)
|
||
(j)
|
||
Answer
|
||
T
|
||
F
|
||
F
|
||
T
|
||
F
|
||
T
|
||
F
|
||
F
|
||
F
|
||
T
|
||
Scheme:
|
||
1 point for giving each correct answer. 10 points in total.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [10 points] Advanced Python: Image Processing with NumPy
|
||
You are working on an image processing project that involves manipulating arrays to perform
|
||
various operations on images. You have been provided with a NumPy array img representing
|
||
a grayscale image of size 512 × 512:
|
||
import numpy as np
|
||
img = np.random.randint(0, 256, size=(512, 512), dtype=np.uint8)
|
||
# Note: random.randint(low, high=None, size=None, dtype=int)
|
||
# returns random integers from low (inclusive) to high (exclusive).
|
||
(a)
|
||
[4 points] Image masking:
|
||
Implement the following function which creates a new array img masked by applying a
|
||
mask to img. The mask is defined as a 2D array mask of size 512 × 512, where mask[i,
|
||
j] = 1 if the pixel at img[i, j] is within a circle (boundary included) of radius 100
|
||
centered at a given position [center x, center y], and mask[i, j] = 0 otherwise.
|
||
def apply_circle_mask(img, center_x, center_y):
|
||
# --- YOUR CODE HERE ---
|
||
return img_masked
|
||
Below is an example for center x = 100 and center y = 200.
|
||
img
|
||
img_masked
|
||
Do NOT use any explicit loop to implement the function. You may find the following
|
||
functions useful for this question.
|
||
np.arange(start, stop, step) returns spaced values within a given interval.
|
||
– start (optional) is the start of interval. The default start value is 0.
|
||
– stop is the end of interval. The returned interval does not include this value.
|
||
– step (optional) is the spacing between values. The default spacing is 1.
|
||
np.square(a) returns the element-wise square of an array.
|
||
– a is the input array.
|
||
This is equivalent to a ** 2.
|
||
np.sqrt(a) returns the element-wise square root of an array.
|
||
– a is the input array.
|
||
np.expand dims(a, axis) returns a new array with a new axis of size 1 inserted.
|
||
– a is the input array.
|
||
– axis is the position where the axis is to be inserted.
|
||
If axis is 0, this is equivalent to a[np.newaxis] and a[None]. If axis is 1, this is
|
||
equivalent to a[:, np.newaxis] and a[:, None].
|
||
(b)
|
||
[6 points] Image blurring:
|
||
Your task is to use NumPy to apply the following 3 × 3 blur filter to img with zero
|
||
padding so that img blur has the same shape as img.
|
||
blur_filter = np.array([[1/9, 1/9, 1/9],
|
||
[1/9, 1/9, 1/9],
|
||
[1/9, 1/9, 1/9]])
|
||
Implement the following function so that after the whole code snippet below is executed,
|
||
img blur stores the desired result.
|
||
def img_flatten_conv_1d(img, v):
|
||
# --- YOUR CODE HERE ---
|
||
return img_conv
|
||
v = blur_filter.sum(0) # ''sum(0)'' returns the sum of the array elements over axis 0.
|
||
img_blur = img_flatten_conv_1d(img, v)
|
||
img_blur = img_flatten_conv_1d(img_blur.T, v).T
|
||
Do NOT use any explicit loop in your code. You may find the following functions useful
|
||
for this question.
|
||
np.convolve(a, v, mode=''full'') returns the discrete, linear convolution of two
|
||
one-dimensional sequences.
|
||
– a of shape (N, ): First one-dimensional input array (or array-like structure, e.g.,
|
||
list).
|
||
– v of shape (M, ): Second one-dimensional input array (or array-like structure,
|
||
e.g., list).
|
||
– mode (optional): The convolution mode. It must be one of ''full'', ''valid'',
|
||
''same''. Note: use ''valid'' for this question. When ''valid'' is used, np.convolve
|
||
is equivalent to the following function:
|
||
def convolve_valid(a, v):
|
||
if len(a) < len(v):
|
||
a, v = v, a
|
||
# swap the array if v is longer than a
|
||
c = np.zeros(len(a) - len(v) + 1)
|
||
for i in range(len(c)):
|
||
c[i] = np.sum(a[i:i+len(v)] * v[::-1])
|
||
return c
|
||
Examples:
|
||
>>> np.convolve([1,2,3],[0,1,0.5], ''valid'')
|
||
array([2.5])
|
||
# this is the output array
|
||
>>> np.convolve([1,2,3,4],[0,1,0.5], ''valid'')
|
||
array([2.5, 3.5])
|
||
# this is the output array
|
||
np.zeros(shape, dtype=float) returns a new array of given shape and type, filled
|
||
with zeros.
|
||
– shape is the shape of the new array, e.g., (2, 3) or 2.
|
||
– dtype (optional) is the desired data type for the array, e.g., np.int8. The default
|
||
is np.float64.
|
||
np.concatenate((a1, a2, ...), axis=0) joins a sequence of arrays along an ex-
|
||
isting axis.
|
||
– a1, a2, ... is a sequence of arrays (or array-like structure, e.g., list). The
|
||
arrays must have the same shape, except in the dimension corresponding to
|
||
axis (the first, by default).
|
||
– axis (optional) is the axis along which the arrays will be joined. If axis is None,
|
||
arrays are flattened before use. The default is 0.
|
||
np.reshape(a, newshape) gives a new shape to an array without changing its data.
|
||
– a is the array to be reshaped.
|
||
– newshape is the new shape. It should be compatible with the original shape. If
|
||
an integer, then the result will be a 1D array of that length. One shape dimension
|
||
can be −1. In this case, the value is inferred from the length of the array and
|
||
the remaining dimensions (if any).
|
||
This is equivalent to a.reshape(newshape).
|
||
np.transpose(a) returns the transpose of an array.
|
||
– a is the input array.
|
||
This is equivalent to a.T.
|
||
Hints:
|
||
(1) In this case, applying blur filter to the image can also be done by consecutively
|
||
applying two 1D filters, one vertically and the other horizontally, to the image.
|
||
(2) Since np.convolve only accepts 1D arrays, you may consider flattening the image
|
||
array, applying np.convolve to the flattened array, and then reshaping it back to a
|
||
2D array.
|
||
(3) Be aware of the boundaries since np.convolve with mode=''valid'' does not pad
|
||
the array and the output array does not always have the same shape as the input
|
||
array. Also, remember to remove the padding (if any) after convolutions.', 10, 3, NULL::jsonb, NULL, NULL, 'Problem 2 [10 points] Advanced Python: Image Processing with NumPy
|
||
Solution:
|
||
(a) def apply_circle_mask(img, center_x, center_y):
|
||
indices = np.arange(512)
|
||
x_dist = indices - center_x
|
||
y_dist = indices - center_y
|
||
dist = np.sqrt(x_dist ** 2 + y_dist[:, None] ** 2)
|
||
img_masked = img * (dist <= 100)
|
||
return img_masked
|
||
(b) def img_flatten_conv_1d(img, v):
|
||
zeros = np.zeros((512, 1))
|
||
img_padded = np.concatenate((zeros, img, zeros), axis=1)
|
||
img_flat = img_padded.reshape(-1)
|
||
img_flat_conv = np.convolve(img_flat, v, ''valid'')
|
||
img_flat_conv = np.concatenate(([0], img_flat_conv, [0]))
|
||
img_conv = img_flat_conv.reshape(512, 514)[:, 1:-1]
|
||
return img_conv
|
||
v = [1/3, 1/3, 1/3]
|
||
img_blur = img_flatten_conv_1d(img, v)
|
||
img_blur = img_flatten_conv_1d(img_blur.T, v).T
|
||
Scheme:
|
||
(a) If no explicit loop is used, 1 point for each of the following:
|
||
correct broadcasting to create 2D distance array;
|
||
correct distance values;
|
||
correct mask (0.5 point for the opposite mask);
|
||
correct final result.
|
||
If explicit loops are used, 1 point in total if the final result is correct.
|
||
(b) If no explicit loop is used, 1 point for each of the following:
|
||
correct padding before flattening the image array (0.5 point if padding is applied
|
||
after flattening);
|
||
correct flattening of the image array;
|
||
correct dimensions of inputs (1D arrays) to np.convolve;
|
||
correct values of inputs to np.convolve;
|
||
correct padding after np.convolve;
|
||
correct reshaping back to 2D image and removing padding (0.5 each).
|
||
If explicit loops are used, 1 point in total if the final result is correct.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [12 points] Na¨ıve Bayes, K-Nearest Neighbors and Perceptron
|
||
Below is some health data of patients with and without dengue fever.
|
||
Patient #
|
||
Diagnosis (B)
|
||
Body Temperature
|
||
(Celsius) (e1)
|
||
Pulse Rate
|
||
(bpm) (e2)
|
||
Dengue
|
||
Dengue
|
||
Dengue
|
||
No dengue
|
||
No dengue
|
||
36.5
|
||
No dengue
|
||
You can assume the data follows Gaussian distribution:
|
||
f(x) =
|
||
√
|
||
2πσ2 exp(−(x −µ)2
|
||
2σ2
|
||
)
|
||
where
|
||
µ = 1
|
||
n
|
||
n
|
||
X
|
||
i=1
|
||
xi
|
||
σ =
|
||
v
|
||
u
|
||
u
|
||
t
|
||
n −1
|
||
n
|
||
X
|
||
i=1
|
||
(xi −µ)2
|
||
(a)
|
||
[6 points] Using the data above, calculate the following. Round off calculations to the
|
||
fourth decimal place.
|
||
(i) P(e1 = 36 | B = No dengue)
|
||
(ii) P(e2 = 85 | B = No dengue)
|
||
(iii) P(e1 = 36 | B = No dengue) P(e2 = 85 | B = No dengue) P(B = No dengue)
|
||
(iv) Calculate the posterior probability that the belief is ‘No dengue’ for the sample with
|
||
evidence E = {e1 = 36, e2 = 85} if
|
||
P((e1 = 36, e2 = 85)|B = Dengue)P(B = Dengue) = 0.0000036
|
||
(b)
|
||
[4.5 points] Identify the reasons for inaccurate predictions when using the following
|
||
number of folds to evaluate the performance of a 3-Nearest Neighbors classifier model on
|
||
the given dataset (i.e., not including the sample introduced in part (a)) without shuffling.
|
||
(i) No. of folds = 6
|
||
(ii) No. of folds = 3
|
||
(iii) No. of folds = 2
|
||
(c)
|
||
[1.5 points] If the true label of the sample {e1 = 36, e2 = 85} is ‘No dengue’, will the
|
||
perceptron model make a good prediction for the sample? Provide explanations with
|
||
evidence for why or why not.', 12, 6, NULL::jsonb, NULL, NULL, 'Problem 3 [12 points] Na¨ıve Bayes, K-Nearest Neighbors and Perceptron
|
||
Solution:
|
||
(a)
|
||
(i)
|
||
p
|
||
2π(0.5)2 exp
|
||
|
||
−(36 −36.5)2
|
||
2(0.52)
|
||
|
||
= 0.4839
|
||
(ii)
|
||
p
|
||
2π(15)2 exp
|
||
|
||
−(85 −90)2
|
||
2(152)
|
||
|
||
= 0.0252
|
||
(iii)
|
||
(0.4839)(0.0252)(0.5) = 0.0061
|
||
(iv)
|
||
0.0061
|
||
0.0061 + 0.0000036 = 0.9994
|
||
(b)
|
||
(i) This will predict 5/6 correctly if uniform distance (prediction for Patient 5 is wrong).
|
||
Also, if inverse distance is used, the prediction for Patient 3 will also be wrong.
|
||
(ii) It is possible that the data will be folded in such a way that the class opposite to
|
||
the true label is typically the majority class e.g.
|
||
(Patient 1, Patient 2), (Patient 3,
|
||
Patient 4), (Patient 5, Patient 6) . In this case, most folds will have 50% accuracy.
|
||
The performance may be even worse as there is one pair of samples from opposite
|
||
classes with minimum pairwise Euclidean distance (Patient 3 and Patient 5).
|
||
(iii) This is not acceptable because it is possible that the training set for the KNN model
|
||
will have the class opposite to the true label as its majority class. In this case, the
|
||
prediction for all samples will be wrong.
|
||
(c) Yes, because in that case the data is linearly separable.
|
||
Scheme:
|
||
(a)
|
||
(i) 1.5 points for giving the correct answer.
|
||
(ii) 1.5 points for giving the correct answer.
|
||
(iii) 1.5 points for giving the correct answer.
|
||
(iv) 1.5 points for giving the correct answer.
|
||
(b)
|
||
(i) 1.5 point for giving the correct reason.
|
||
(ii) 1.5 point for giving the correct reason.
|
||
(iii) 1.5 point for giving the correct reason.
|
||
(c) 0.5 point for stating “Yes”, i.e., perceptron model will make a good prediction for the
|
||
sample. 1 point for giving the correct explanation.', ARRAY['Probabilistic Models', 'KNN and Clustering', 'Perceptron and MLP']::TEXT[], 'Probabilistic Models', NULL, ARRAY['Probabilistic Models', 'KNN and Clustering', 'Perceptron and MLP']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision', 'distance_calculation', 'algorithm_tracing', 'forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [11 points] Multi-layer Perceptron
|
||
(a)
|
||
[5 points] Amanda wants to train a multi-layer perceptron to classify various renting
|
||
options’ popularity.
|
||
Each housing option is represented with X, a three-dimensional
|
||
vector (x1, x2, x3).
|
||
x1 is the noise level rated from 1 to 5. x2 is the proximity to campus, measured in minutes
|
||
it takes to arrive at the north gate. x3 is the availability of food options rated on a scale
|
||
from 1 (no food available) to 5 (a wide variety of food available). Amanda has collected
|
||
the opinions of a group of students on their preferences, classifying the housing options
|
||
into three types: low, medium, and high, represented as one-hot vector Y = (y1, y2, y3).
|
||
When designing the network architecture, Amanda has two ideas.
|
||
Model A
|
||
Model B
|
||
Layer (type)
|
||
Output Shape
|
||
Layer (type)
|
||
Output Shape
|
||
dense 1 (Dense)
|
||
(None, 6)
|
||
dense 1 (Dense)
|
||
(None, 3)
|
||
dense 2 (Dense)
|
||
(None, 4)
|
||
dense 2 (Dense)
|
||
(None, 5)
|
||
dense 3 (Dense)
|
||
(None, 3)
|
||
dense 3 (Dense)
|
||
(None, 2)
|
||
dense 4 (Dense)
|
||
(None, 3)
|
||
(i) In a multi-layer perceptron, suppose there are L hidden layers, each layer k (starting
|
||
from 1) has lk hidden nodes. The input data is a n-dimensional vector, and the
|
||
output data is m-dimensional.
|
||
(I) Calculate the number of updated parameters in the MLP model. Represent your
|
||
result using n, m, L, and lk for k = 1, . . . , L.
|
||
(II) Apply your result in part (a)(i)(I) to Model A and Model B separately.
|
||
(ii) Please help Amanda decide on which multi-layer perceptron (A or B) to choose for
|
||
potentially better performance. Please briefly explain why.
|
||
(b) [1 point] Why do we use activation functions in multi-layer perceptron neural networks?
|
||
(c)
|
||
[2 points] Consider the two activation functions we have learned in class:
|
||
Sigmoid activation function: σ(x) =
|
||
1+e−x .
|
||
Binary step activation function:
|
||
f(x) =
|
||
|
||
|
||
|
||
if x ≤0
|
||
otherwise
|
||
The binary step activation function gives a hard threshold at 0, and its gradients are 0
|
||
almost everywhere. What is the problem if we use the binary step activation function in
|
||
a multi-layer perceptron network? If we want to avoid the problem while approximating
|
||
the binary step activation function, how to make use of the sigmoid activation function
|
||
to achieve that?
|
||
(d)
|
||
[3 points] You are given an multi-layer perceptron model with the architecture shown
|
||
below.
|
||
ReLU: f(z) = max(0, z).
|
||
Softmax: f(zi) =
|
||
ezi
|
||
Pn−1
|
||
j=0 ezj , where z = [z0, z1, . . . , zn−1].
|
||
(i) For a sample with features x1=1 and x2=1, what are the outputs of the hidden layer
|
||
and the output layer? If necessary, round off the values to two decimal places.
|
||
(ii) If the target labels have values Tk1=1,Tk2=0 for the sample in part d(i), calculate the
|
||
new values of the weights: w5, w7, and w1 after one round of backward propagation
|
||
if the learning rate is 0.4. Round off the values to four decimal places.
|
||
For reference, here are some of the equations used in the back propagation.
|
||
δk = (Ok −Tk)Ok(1 −Ok)
|
||
δj = Oj(1 −Oj)
|
||
X
|
||
k∈K
|
||
δkwjk
|
||
wjk ←wjk −ηδkOj
|
||
wij ←wij −ηδjOi', 11, 7, NULL::jsonb, NULL, NULL, 'Problem 4 [11 points] Multi-layer Perceptron
|
||
Solution:
|
||
(a)
|
||
(i) (I) For any hidden layer and the output layer, the parameters include the weight
|
||
and the bias.
|
||
(n + 1) × l1 +
|
||
L−1
|
||
X
|
||
k=1
|
||
lk+1 × (lk + 1) + m × (lL + 1)
|
||
(II) Model A: 67; Model B: 53.
|
||
(ii) Model A is better because it has more parameters and therefore more expressive.
|
||
(b) To add non-linearity to the neural network model so that it has more powerful modeling
|
||
capability.
|
||
(c) The problem is that we cannot learn the parameters using gradient descent since the
|
||
gradients are 0 almost everywhere. We can solve the problem while approximating a
|
||
hard threshold by scaling up the weights in a sigmoid activation function. For example,
|
||
σ(cx) is steeper than σ(x) and more similar with the binary step function, for c > 1.
|
||
(d)
|
||
(i) Hidden layer Oj1 = 0.2, Oj2 = 0
|
||
Output layer Ok1 = 0.48, Ok2 = 0.52
|
||
(ii) w
|
||
′
|
||
5 = w5 −δk1ηOj1 = 0.1 −(−1298)(0.4)(0.2) = 0.1104
|
||
w
|
||
′
|
||
7 = w7 −δk2ηOj1 = 0 −(1298)(0.4)(0.2) = −0.0104
|
||
δj1 = Oj1(1−Oj1)(δk1w5+δk2w7) = 0.2(1−0.2)(−0.1298(0.1)+0.1298(0)) = −0.0021
|
||
w
|
||
′
|
||
1 = w1 −δj1ηx1 = 0.2 −(0.4)(−0.0021)(1) = 0.2008
|
||
Scheme:
|
||
(a)
|
||
(i) (I) 1.5 points for giving the correct formula.
|
||
(II) 1 point for each correct answer. 2 points in total.
|
||
(ii) 0.5 point for stating Model A is better. 1 point for giving the correct explanation.
|
||
1.5 points in total.
|
||
(b) 1 point for giving the correct answer for why we use activation functions in multi-layer
|
||
perceptron.
|
||
(c) 1 point for stating the problem. 1 point for explaining how to make use of the sigmoid
|
||
function to avoid the problem. 2 points in total.
|
||
(d)
|
||
(i) 0.5 point for giving each correct output. 1.5 points in total.
|
||
(ii) 0.5 point for giving each correct weight value. 1.5 points in total.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [13 points] Digital Image Processing
|
||
(a) [4.5 points] The following is a grayscale image of the HKUST redbird and its correspond-
|
||
ing histogram. An grayscale image histogram is a distribution showing the frequency of
|
||
occurrence of each gray-level value.
|
||
After some transformation on the original image, we get the transformed images shown
|
||
below (first row). Their histograms are shuffled and shown in the rows below (second,
|
||
third, and fourth row). Please state which transformations are applied to get the resulting
|
||
images for (a),(b),(c), and what is the correct pairing for the images and histograms
|
||
between {a,b,c} and {d,e,f}. Please also briefly state why.
|
||
(b)
|
||
[3 points] Consider the following 3 × 3 image.
|
||
Perform binary thresholding of the
|
||
image using Otsu’s method. The initial threshold is T=100, and we apply one iteration.
|
||
What is the resulting threshold and the resulting image after thresholding? What is the
|
||
advantage of using Otsu’s Method for image thresholding compared to the regular image
|
||
thresholding algorithm?
|
||
(c)
|
||
[2 points] What is the resulting image of size 7 × 7 after adding reflection padding of
|
||
size 2 on the original image in part (b)?
|
||
(d)
|
||
[1.5 points] Please briefly explain the effect of convolving an image with the following
|
||
kernels.
|
||
(i) Kernel 1:
|
||
1/9
|
||
1/9
|
||
1/9
|
||
1/9
|
||
1/9
|
||
1/9
|
||
1/9
|
||
1/9
|
||
1/9
|
||
(ii) Kernel 2:
|
||
-1
|
||
-1
|
||
-1
|
||
(iii) Kernel 3:
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
-1
|
||
(e)
|
||
[2 points] Is it possible to design a 3 × 3 kernel and apply convolution with the kernel
|
||
to flip a 64Ö64 image horizontally or vertically? If yes, please give such a kernel. If not,
|
||
please explain why.', 13, 9, NULL::jsonb, NULL, NULL, 'Problem 5 [13 points] Digital Image Processing
|
||
Solution:
|
||
(a) (b)-(f): image (horizontal) flipping because the histogram is the same as the original
|
||
image.
|
||
(c)-(d): binary thresholding because there are only two values 0 and 255 in the histogram
|
||
(a)-(e): contrast stretching because the intensity value in the histogram has been stretched
|
||
to a wider range.
|
||
(b) After the first iteration µ1=21 , µ2=128, T = 74.5 The resulting images are
|
||
Compared to regular image thresholding algorithms, Otsu’s method has advantages (i)
|
||
can automatically determine the threshold value T (ii) the resulting threshold value is
|
||
reproducible. Given the same image, two researchers using Otsu’s algorithm must arrive
|
||
at the same threshold.
|
||
(c) Resulting image:
|
||
(d) (i) smoothing, (ii) vertical edge detection, (iii) sharpening.
|
||
(e) No, because the image flipping is a global operation on the image, but convolution with
|
||
a 3 × 3 kernel is a local operation. Concretely, the 3 × 3 kernels can only capture the
|
||
input value of 3 × 3 neighbors, but the flipping requires the pixel value information at a
|
||
longer distance. The longest dependency distance can be 64.
|
||
Scheme:
|
||
(a) 1 for stating each transformation correctly. 3 points in total. 0.5 point for giving each
|
||
correct pairing. 1.5 points in total.
|
||
(b) 1 point for giving the correct resulting threshold. 1 point for giving the correct result
|
||
image. 1 point for stating the advantage of using Otsu’s method. 3 points in total.
|
||
(c) 0.05 for giving each correct value (40 values). 2 points in total.
|
||
(d) 0.5 point for stating each effect of convolving with the given kernel correctly. 1.5 points
|
||
in total.
|
||
(e) 0.5 point for stating it is impossible to design a 3×3 kernel and apply it to flip the 64×64
|
||
image. 1.5 points for giving the explanation.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [13 points] Dilated Convolution and Dropout
|
||
(a)
|
||
[10 points] Dilated convolution is a variation of the standard convolution operation that
|
||
involves skipping input elements by a certain dilation rate. By doing this, the convo-
|
||
lutional kernel can be made to “see” a larger area of the input data without actually
|
||
increasing the number of parameters it has. The dilation rate determines the spacing
|
||
between the values in the kernel.
|
||
The figure below demonstrates how a 3×3 kernel is applied to a 7×7 image using a
|
||
dilation factor of 2.
|
||
Given the incomplete implementation of the Python function dilated convolution that
|
||
takes the following inputs:
|
||
input array: a 2D NumPy array representing the input data
|
||
kernel: a 2D NumPy array representing the convolutional kernel
|
||
dilation rate: an integer representing the dilation rate (default value is 1)
|
||
stride: an integer representing the stride (default value is 1)
|
||
padding: a string representing the padding type, either ‘valid’ (no padding) or ‘same’
|
||
(padding to preserve input dimensions) (default value is ‘valid’)
|
||
and returns a 2D NumPy array representing the output of the dilated convolution oper-
|
||
ation.
|
||
Complete the missing parts of the function using NumPy, without using any special-
|
||
ized deep learning libraries so that the execution of the test script produces the required
|
||
output. Make sure that your implementation supports stride and padding options.
|
||
You may find the following formula for determining the size of output image of reg-
|
||
ular image convolution useful for this question.
|
||
(Size of image dimension - Size of kernel dimension + 2 × Padding) / Stride + 1
|
||
import numpy as np
|
||
# Zero pad the input_array. For example
|
||
# a = [[1,2,3]]
|
||
# np.pad(a, ((1,2),(3,4)), ''constant'')
|
||
# >> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||
# >>
|
||
[0, 0, 0, 1, 2, 3, 0, 0, 0, 0],
|
||
# >>
|
||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||
# >>
|
||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
|
||
def dilated_convolution(input_array, kernel, dilation_rate=1,
|
||
stride=1, padding=''valid''):
|
||
# Apply padding to the input array if specified
|
||
if padding == ''same'':
|
||
pad = #______ TODO 1 ______
|
||
padded_input = np.pad(input_array, ((pad, pad), (pad, pad)), mode=''constant'')
|
||
else:
|
||
padded_input = input_array
|
||
# Calculate the output shape
|
||
output_rows = #______ TODO 2 ______
|
||
output_cols = #______ TODO 3 ______
|
||
kernel_rows, kernel_cols = kernel.shape
|
||
# Initialize the output array with zeros
|
||
output_array = np.zeros((output_rows, output_cols))
|
||
# Iterate through the kernel and perform the convolution
|
||
for i in range(kernel_rows):
|
||
for j in range(kernel_cols):
|
||
# Calculate the input indices for the current kernel position
|
||
input_row_indices = #______ TODO 4 ______
|
||
input_col_indices = #______ TODO 5 ______
|
||
# Perform the convolution and accumulate the results in the output array
|
||
output_array += #______ TODO 6 ______
|
||
return output_array
|
||
# Test script
|
||
input_array = np.array(np.arange(100).reshape(10,10))
|
||
kernel = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
|
||
dilation_rate = 2; stride = 2; padding = ''same''
|
||
output_array = dilated_convolution(input_array, kernel,
|
||
dilation_rate, stride, padding)
|
||
print(output_array)
|
||
# Output:
|
||
# [[ 22.
|
||
26.
|
||
30.
|
||
34.
|
||
8.]
|
||
#
|
||
[ 62.
|
||
66.
|
||
72.
|
||
78.
|
||
34.]
|
||
#
|
||
[102. 126. 132. 138.
|
||
74.]
|
||
#
|
||
[142. 186. 192. 198. 114.]
|
||
#
|
||
[ 80. 142. 146. 150. 154.]]
|
||
You may find the following functions useful for this question.
|
||
np.arange(start, stop, step) returns spaced values with a given interval.
|
||
– start (optional) is the start of interval. The default start value is 0.
|
||
– stop is the end of interval. The returned interval does not include this value.
|
||
– step (optional) is the spacing between values. The default spacing is 1.
|
||
np.array(object) creates an array.
|
||
– object is an array or any sequence.
|
||
np.reshape(a, newshape) gives a new shape to an array without changing its data.
|
||
– a is the array to be reshaped.
|
||
– newshape is the new shape. It should be compatible with the original shape. If
|
||
an integer, then the result will be a 1D array of that length. One shape dimension
|
||
can be -1. In this case, the value is inferred from the length of the array and the
|
||
remaining dimensions (if any).
|
||
This is equivalent to a.reshape(newshape).
|
||
ndarray.shape returns a tuple of array dimensions.
|
||
ndarray.zeros(shape, dtype=float) returns a new array of given shape and type,
|
||
filled with zeros.
|
||
– shape is the shape of the new array, e.g., (2,3) or 2.
|
||
– dtype(optional) is the desired data type for the array, e.g., np.int8. The default
|
||
is np.float64.
|
||
range(n) returns a numeric series starting with 0 and extending up to but not
|
||
including n.
|
||
(b) [3 points] You are given an incomplete implementation of the Python function dropout,
|
||
which implements the Dropout technique for regularization in neural networks.
|
||
The
|
||
function takes the following input:
|
||
input array: a 2D NumPy array representing the input data
|
||
p: a dropout probability
|
||
The function generates a binary mask with the same shape as the input, where each
|
||
element is 1 with probability 1-p and 0 with probability p. The input is then multiplied
|
||
element-wise by the mask, effectively dropping out random elements. The function re-
|
||
turns a NumPy array representing the output of the dropout operation.
|
||
Complete the missing part of the function using NumPy, without using any special-
|
||
ized deep learning libraries so that the execution of the test script produces the required
|
||
output.
|
||
You may find the following function useful for this question.
|
||
np.random.rand(d0, d1, . . ., dn) returns an array of the given shape and popu-
|
||
late it with random samples from a uniform distribution over [0, 1).
|
||
– d0, d1,. . ., dn (optional) represent the dimensions of the returned array, must
|
||
be non-negative. If no argument is given, a single Python float is returned.
|
||
import numpy as np
|
||
def dropout(input_array, p):
|
||
mask = #______ TODO ______
|
||
return input_array * mask', 13, 12, NULL::jsonb, NULL, NULL, 'Problem 6 [13 points] Dilated Convolution and Dropout
|
||
Solution:
|
||
(a) Dilated Convolution
|
||
TODO #
|
||
Answer
|
||
((kernel.shape[0] - 1) * dilation_rate) // 2
|
||
1.5 points
|
||
(input_array.shape[0] - kernel.shape[0] * dilation_rate + 2 * pad) // stride + 1
|
||
1.5 points
|
||
(input_array.shape[1] - kernel.shape[1] * dilation_rate + 2 * pad) // stride + 1
|
||
1.5 points
|
||
np.arange(0, output_rows * stride, stride) + i * dilation_rate
|
||
1.5 points
|
||
np.arange(0, output_cols * stride, stride) + j * dilation_rate
|
||
1.5 points
|
||
kernel[i, j] * padded_input[np.ix_(input_row_indices, input_col_indices)]
|
||
2.5 points
|
||
(b) Dropout
|
||
(np.random.rand(input_array.shape[0],input_array.shape[1]) < p) / p
|
||
# 3 points', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [18 points] Convolutional Neural Network
|
||
(a)
|
||
[8 points] Consider the following Keras implementation of a Convolutional Neural Net-
|
||
work:
|
||
from keras.models import Sequential
|
||
from keras.layers import Conv2D, MaxPooling2D
|
||
from keras.layers import Dense, Flatten
|
||
model = Sequential()
|
||
model.add(Conv2D(filters=32, kernel_size=(5, 5), padding=''same'',
|
||
activation=''relu'', input_shape=(32, 32, 3)))
|
||
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'',
|
||
activation=''relu''))
|
||
model.add(MaxPooling2D(pool_size=(2, 2)))
|
||
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'',
|
||
activation=''relu'', strides=(2, 2)))
|
||
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'',
|
||
activation=''relu''))
|
||
model.add(MaxPooling2D(pool_size=(2, 2)))
|
||
model.add(Flatten())
|
||
model.add(Dense(units=128, activation=''relu''))
|
||
model.add(Dense(units=10, activation=''softmax''))
|
||
(i) What is the padding size (in terms of border width) of the first convolutional layer?
|
||
(ii) Fill in the blanks in the answer book in the model.summary() of the network. Assume
|
||
all convolutional layers have biases. Numbers shown as ? are kept secret and you
|
||
may ignore them.
|
||
Hint: When padding = “same” and strides = 2, the output has the half size of the
|
||
input.
|
||
(iii) Suppose you observe the following loss curves as you train the Convolutional Neural
|
||
Network. What is the most likely problem with the model? When does the problem
|
||
usually occur? Describe a common way to mitigate the problem and explain why it
|
||
works (in a few sentences).
|
||
(b) [5 points] What is the output of the following parts of this very tiny convolutional neural
|
||
network? This network was trained to classify whether the input 4×4 image depicts the
|
||
letter “L” or not. Stride is 1, padding size is 0 for all layers, and the pool size for the
|
||
max pooling layer is (2, 2). Show your calculation step by step. You only need to keep
|
||
1 decimal number.
|
||
ReLU activation function f(z) = max(0, x).
|
||
Sigmoid activation function f(z) =
|
||
1+e−z , where e = 2.718.
|
||
Note: Assume the given kernels have already been flipped.
|
||
(i) Feature map after Max-Pooling
|
||
(ii) Feature in the fully-connected layer
|
||
(iii) Output
|
||
(c) [5 points] In video processing, 3D convolution is particularly valuable due to its ability to
|
||
capture temporal dynamics as well as spatial features, which is essential for understanding
|
||
content across frames. Unlike 2D convolution, which only analyzes spatial patterns within
|
||
a single frame, 3D convolution extends the analysis across multiple consecutive frames,
|
||
treating time as the third dimension alongside height and width. This approach allows a
|
||
Convolutional Neural Network (CNN) to not only perceive static features in individual
|
||
frames but also to understand movements and changes over time, which are crucial for
|
||
tasks such as action recognition, video classification, and anomaly detection in video
|
||
streams.
|
||
Suppose we have video data of shape (32, 400, 300, 3), corresponding to (#keyframes,
|
||
width, height, #channels). In the first layer, we want to apply a 3D convolution with
|
||
100 kernels in the shape of (3, 3, 3), corresponding to (keyframe, width, height). The
|
||
padding is (0, 2, 2), and the stride is (1, 2, 2). Please answer the following question with
|
||
calculation steps or rationales.
|
||
(i) What is the shape of the output after this layer?
|
||
(ii) How many weight parameters are there?
|
||
(iii) How many bias parameters are there?', 18, 16, NULL::jsonb, NULL, NULL, 'Problem 7 [18 points] Convolutional Neural Network
|
||
Solution:
|
||
(a)
|
||
(i) 2
|
||
(ii) 16, 16, 64
|
||
8,
|
||
8,
|
||
(iii) The model has overfit the training dataset.
|
||
Overfitting occurs when a model is
|
||
too complex and learns the noise in the training data rather than the underlying
|
||
patterns.
|
||
Dropout can be used to mitigate the problem. Dropout is a technique where, during
|
||
training, some neurons in the network are randomly dropped out (i.e., their outputs
|
||
are set to zero) with a certain probability, typically 0.2 or 0.5. This means that,
|
||
at each training iteration, a different subset of neurons is randomly selected to be
|
||
“dropped out.”
|
||
Dropout helps prevent overfitting in several ways (full marks to any of the fol-
|
||
lowing):
|
||
Reducing capacity: By randomly dropping out neurons, the network’s capacity
|
||
is reduced, making it less prone to overfitting. With fewer neurons, the network
|
||
has fewer opportunities to memorize the training data.
|
||
Forcing feature sharing: Dropout encourages feature sharing among neurons.
|
||
When a neuron is dropped out, the network must rely on other neurons to make
|
||
predictions, which promotes feature sharing and reduces overfitting.
|
||
Preventing complex co-adaptations: Dropout breaks the complex co-adaptations
|
||
between neurons, which can lead to overfitting. By randomly dropping out neu-
|
||
rons, the network is forced to learn simpler, more generalizable representations.
|
||
Improving generalization: Dropout can be seen as a form of data augmentation.
|
||
By randomly dropping out neurons, the network is forced to generalize to new,
|
||
unseen situations, which improves its ability to generalize to new data.
|
||
Reducing the risk of over-reliance on a single neuron: Dropout prevents the
|
||
network from relying too heavily on a single neuron or a small group of neurons.
|
||
This reduces the risk of overfitting, as the network is forced to use multiple
|
||
neurons to make predictions.
|
||
Ensemble-like behavior: Dropout can be seen as an ensemble method, where
|
||
multiple sub-networks are trained simultaneously. Each sub-network is a different
|
||
subset of neurons, and the final prediction is an ensemble of these sub-networks.
|
||
This ensemble-like behavior improves generalization and reduces overfitting.
|
||
The answer is not unique.
|
||
(b) Answer:
|
||
(c)
|
||
(i) For any dimension (let’s denote it generically as D): Output D = floor((Input D +
|
||
2 × Padding - Kernel D ) / Stride) + 1. Therefore, we have:
|
||
Output keyframe = floor((32 + 2 * 0 - 3) / 1) + 1 = 30
|
||
Output width = floor((400 + 2 * 2 - 3) / 2) + 1 = 201
|
||
Output height = floor((300 + 2 * 2 - 3) / 2) + 1 = 151
|
||
The output shape is (30, 201, 151, 100).
|
||
(ii) There are 100 kernels in the shape of (3, 3, 3). Therefore, the number of weight
|
||
parameters is 1000 Ö 3 Ö 3 Ö 3 x 3 = 8100.
|
||
(iii) There is 1 bias parameter per kernel. So the total biases is 100.
|
||
Scheme:
|
||
(a)
|
||
(i) 1 point for giving the correct padding size.
|
||
(ii) 1 point for giving each correct shape (3 shape values). 1 point for giving the correct
|
||
number of parameters. 4 points in total.
|
||
(iii) 1 point for stating the model has overfit the training dataset. 1 point for explaining
|
||
what does the problem usually occur. 1 point for describing a way to mitigate the
|
||
problem. 3 points in total.
|
||
(b) 1 point for giving each correct feature map (2 feature maps). 1 point for giving each
|
||
feature in the fully-connected layer (2 features). 1 point for giving the correct output. 5
|
||
points in total.
|
||
(c)
|
||
(i) 2 points for giving the correct output shape after this layer.
|
||
(ii) 1.5 points for giving the correct number of weight parameters.
|
||
(iii) 1.5 points for giving the correct total number of biases.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'hard', '', '', ''),
|
||
('COMP2211-2024-spring-final', '8', NULL, 8, 'long_question', 'long_answer', 'Problem 8 [10 points] Minimax and Alpha-Beta Pruning
|
||
(a)
|
||
[5 points] The figure below shows a tree in a minimax game with two players.
|
||
A
|
||
B
|
||
C
|
||
D
|
||
E
|
||
F
|
||
G
|
||
H
|
||
I
|
||
J
|
||
MIN
|
||
MAX
|
||
Terminal
|
||
nodes
|
||
Score
|
||
(i) Calculate the best score for the non-terminal nodes with the minimax algorithm.
|
||
(ii) Suppose we are using an alpha-beta pruning algorithm. Indicate which edge will be
|
||
pruned. Gives the alpha-beta values when running. We denote the edge between
|
||
nodes A and B as AB.
|
||
(iii) How do you make a minimax algorithm to find the shortest path to victory? Explain
|
||
in one sentence.
|
||
(b) [5 points] Consider the non zero-sum generalization in which the sum of the two players’
|
||
utilities are not necessarily zero. Because player A’s utility no longer determines player
|
||
B’s utility exactly, the leaf utilities are written as pairs (UA, UB), with the first and second
|
||
component indicating the utility of that leaf to player A and player B respectively. In
|
||
this generalized setting, player A seeks to maximize UA, the first component, while player
|
||
B seeks to maximize UB, the second component.
|
||
(i) Complete the table in the answer book by estimating the value (as pairs) at each of
|
||
the internal node. Assume that each player maximizes their own utility.
|
||
(ii) Is alpha-beta pruning still applicable in this case? Briefly explain why and provide
|
||
an example. (Hint: you can think about the case where UA(s) = UB(s) for all nodes.)', 10, 18, NULL::jsonb, NULL, NULL, 'Problem 8 [10 points] Minimax and Alpha-Beta Pruning
|
||
Solution:
|
||
(a)
|
||
(i) Answer:
|
||
Nodes
|
||
Score
|
||
A
|
||
B
|
||
C
|
||
D
|
||
(ii) Answer:
|
||
Edge
|
||
Alpha
|
||
Beta
|
||
CH
|
||
DJ
|
||
(iii) Record depth information to distinguish paths.
|
||
(b)
|
||
(i) Answer:
|
||
A
|
||
(2,4)
|
||
B
|
||
(0,3)
|
||
C
|
||
(-1,3)
|
||
D
|
||
(1,1)
|
||
E
|
||
(0,-2)
|
||
(ii) No. The values that the first and second player are trying to maximize are inde-
|
||
pendent. Therefore, the principle for pruning in alpha-beta pruning—that a worse
|
||
outcome for one player implies a better outcome for the other—no longer applies.
|
||
For instance, in the case where UA(s) = UB(s) for all nodes, the problem reduces to
|
||
searching for the max-valued leaf, which could appear anywhere in the tree.
|
||
Scheme:
|
||
(a)
|
||
(i) 0.25 point for giving each correct numeric value. 1 point in total.
|
||
(ii) 0.5 point for giving each correct edge/numeric value. 3 points in total.
|
||
(iii) 1 point for giving a way to find the shortest path to victory.
|
||
(b)
|
||
(i) 0.5 point for giving each pair of values. 2.5 points in total.
|
||
(ii) 0.5 point for stating “No”. 1 point for explaining why and 1 point for giving an
|
||
example. 2.5 points in total.', ARRAY['Search and Games']::TEXT[], 'Search and Games', 'Search and Games', ARRAY['Search and Games']::TEXT[], ARRAY['tree_search', 'pruning', 'manual_tracing']::TEXT[], 'medium', '', '', ''),
|
||
('COMP2211-2024-spring-final', '9', NULL, 9, 'long_question', 'short_answer', 'Problem 9 [3 points] Ethics of Artificial Intelligence
|
||
From what areas we should ensure the AI models in production in their organizations func-
|
||
tion ethically?
|
||
-------------------- END OF PAPER
|
||
--------------------
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */
|
||
/* Rough work */', 3, 19, NULL::jsonb, NULL, NULL, 'Problem 9 [3 points] Ethics of Artificial Intelligence
|
||
Solution:
|
||
Data Ethics
|
||
Fair AI model (or avoiding AI model bias)
|
||
AI model monitoring and maintenance.
|
||
Scheme:
|
||
1 point for giving each area correctly. 3 points in total.
|
||
-------------------- END OF PAPER
|
||
--------------------', ARRAY['Ethics of AI']::TEXT[], 'Ethics of AI', 'Ethics of AI', ARRAY['Ethics of AI']::TEXT[], ARRAY['concept_explanation', 'argumentation', 'comparison']::TEXT[], 'easy', '', '', '')
|
||
) AS seed(
|
||
source_exam_key,
|
||
question_number,
|
||
parent_question,
|
||
display_order,
|
||
question_type,
|
||
question_format,
|
||
question_text,
|
||
score,
|
||
page_number,
|
||
options,
|
||
correct_option,
|
||
correct_answer,
|
||
raw_answer_text,
|
||
topics,
|
||
topic_primary,
|
||
analytics_topic,
|
||
topic_tags,
|
||
skill_tags,
|
||
difficulty,
|
||
knowledge_reminder,
|
||
ai_hint,
|
||
solution
|
||
)
|
||
JOIN papers AS p
|
||
ON p.source_exam_key = seed.source_exam_key
|
||
AND p.source_kind = 'course_library'
|
||
WHERE NOT EXISTS (
|
||
SELECT 1 FROM paper_questions q
|
||
WHERE q.paper_id = p.id
|
||
AND q.question_number = seed.question_number
|
||
);
|