-- ============================================
-- PastPaper Master — COMP2211 problem-level questions
-- Seed Date: 2026-03-24
-- ============================================
--
-- Preconditions:
-- 1. Run comp2211_course_library_papers.sql first.
-- 2. Ensure those paper rows already exist in `papers`.
--
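-- This seed inserts one row per top-level Problem, with parent_question left NULL.
-- Sub-parts such as (a), (b), (i) are not split out: they stay embedded in the
-- parent row's question_text and raw_answer_text.
-- It is intentionally coarse-grained and idempotent.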

INSERT INTO paper_questions (
  paper_id,
  question_number,
  parent_question,
  display_order,
  question_type,
  question_format,
  question_text,
  score,
  page_number,
  options,
  correct_option,
  correct_answer,
  raw_answer_text,
  topics,
  topic_primary,
  analytics_topic,
  topic_tags,
  skill_tags,
  difficulty,
  knowledge_reminder,
  ai_hint,
  solution
)
SELECT
  p.id,
  seed.question_number,
  seed.parent_question,
  seed.display_order,
  seed.question_type,
  seed.question_format,
  seed.question_text,
  seed.score,
  seed.page_number,
  seed.options,
  seed.correct_option,
  seed.correct_answer,
  seed.raw_answer_text,
  seed.topics,
  seed.topic_primary,
  seed.analytics_topic,
  seed.topic_tags,
  seed.skill_tags,
  seed.difficulty,
  seed.knowledge_reminder,
  seed.ai_hint,
  seed.solution
FROM (
  VALUES
    ('COMP2211-2022-fall-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [15 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1.5 points for each correct answer.
(a) Machine learning gives computers the ability to make decision by writing down rules and
methods and being explicitly programmed.
(b) The regression technique in machine learning is NOT a group of algorithms that are used
for predicting a class/category.
(c) A 5-fold cross validation for K-nearest neighbors algorithm means that for each value
of K, we randomly select 1/5 of the training data as the validation set to evaluate the
model which is trained by the remaining (4/5) of the training data.
(d) If we use K-means clustering, we will get the same cluster assignments for each data
point, whether or not we standardize the variables.
(e) Given an input data point x, where all the attribute values in x are real numbers. Suppose
you are asked to predict a label y for x, where y = 0 or y = 1. Assume you have no
knowledge about the distributions of x and y, perceptron is an appropriate method for
this problem.
(f) A perceptron with the unit step function (i.e., f(z) = 0 if z ≤0, otherwise f(z) = 1) as
the activation function cannot be used for multi-class classification.
(g) If a training dataset is linearly separable into two classes, the perceptron learning rule
will always converge to weights and bias that accomplish the desired classification.
(h) The neural network weights are updated during forward propagation.
(i) An advantage of gradient descent-based methods, such as back-propagation, is that they
cannot get stuck in local minima.
(j) The back-propagation algorithm, when run until a minimum is achieved, always finds
the same solution (i.e., weights and biases) no matter what the initial set of weights and
biases are.
Question
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)
Answer', 15, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [15 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1.5 points for each correct answer.
(a) Machine learning gives computers the ability to make decision by writing down rules and
methods and being explicitly programmed.
(b) The regression technique in machine learning is NOT a group of algorithms that are used
for predicting a class/category.
(c) A 5-fold cross validation for K-nearest neighbors algorithm means that for each value
of K, we randomly select 1/5 of the training data as the validation set to evaluate the
model which is trained by the remaining (4/5) of the training data.
(d) If we use K-means clustering, we will get the same cluster assignments for each data
point, whether or not we standardize the variables.
(e) Given an input data point x, where all the attribute values in x are real numbers. Suppose
you are asked to predict a label y for x, where y = 0 or y = 1. Assume you have no
knowledge about the distributions of x and y, perceptron is an appropriate method for
this problem.
(f) A perceptron with the unit step function (i.e., f(z) = 0 if z ≤0, otherwise f(z) = 1) as
the activation function cannot be used for multi-class classification.
(g) If a training dataset is linearly separable into two classes, the perceptron learning rule
will always converge to weights and bias that accomplish the desired classification.
(h) The neural network weights are updated during forward propagation.
(i) An advantage of gradient descent-based methods, such as back-propagation, is that they
cannot get stuck in local minima.
(j) The back-propagation algorithm, when run until a minimum is achieved, always finds
the same solution (i.e., weights and biases) no matter what the initial set of weights and
biases are.
Question
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)
Answer
F
T
F
F
F
T
T
F
F
F
Marking scheme:
 1.5 points for each correct answer. 15 points in total', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-fall-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [19 points] Python Fundamentals
(a)
[7 points] Consider the following NumPy arrays:
import numpy as np
A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
B = np.array([[[0, 1, 2, 3],
[4, 5, 6, 7],
[8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
Write the output for each of the following Python statements. If the output is an empty
array, write “Empty Array”. If an error occurs, write “Error”.
(i) print(A[::3])
(ii) print(A[:-2:-2])
(iii) print(B[:, 1, 3:0:-1])
(iv) print(B[0, 1, [0,3]])
The NumPy array B is repeated here to ease your reading.
B = np.array([[[0, 1, 2, 3],
[4, 5, 6, 7],
[8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
(v) print(B[ B % 4 == 0 ])
(vi) # numpy.sum(a, axis)
# returns the sum of array elements over a given axis
print( np.sum( B, axis=1 ) )
(vii) # numpy.ndarray.reshape(shape)
# returns an array containing the same data with a new shape
print( B.reshape( (3, -1, 2) ) )
(b)
[6 points] Write the output for the following Python code segments. If the output is an
empty array, write “Empty Array”. If an error occurs, write “Error”.
(i) import numpy as np
A = np.array([[1, 2, 3], [2, 4, 6]])
B = np.array([1, 2, 3])
print(A * B)
(ii) import numpy as np
A = np.array([1, 2, 3, 4])
B = np.array([[1, 2], [2, 4], [3, 6], [4, 8]])
print(A + B)
(iii) import numpy as np
A = np.array([1, 2, 3, 4])
B = np.array([[2], [3], [4]])
print(A + B)
(c)
[6 points] The cosine similarity of two non-zero vectors, xTrain = (xTrain
, . . . , xTrain
n
)
and xTest = (xTest
, . . . , xTest
n
), can be calculated by the following formula:
Pn
i=1 (xTrain
i
× xTest
i
)
qPn
i=1 (xTrain
i
)2
qPn
i=1 (xTest
i
)2
For example, if a training sample xTrain is (0, 1, 2) and a testing sample xTest is (4, 6,
8), the cosine similarity is:
0 × 4 + 1 × 6 + 2 × 8
√
02 + 12 + 22√
42 + 62 + 82 = 0.91350028
Given the following NumPy arrays, X_train and X_test, where each 1D array represents
a data point:
import numpy as np
X_train = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]])
X_test = np.array([[4, 6, 8], [5, 0, 0]])
Compute the cosine similarity scores between each data point in X_train and each data
point in X_test with a one-line Python expression, such that the evaluated result of
the expression is:
[[0.91350028 0.
]
[1.
0.37139068]
[0.99461155 0.45584231]]
N ote:
 An expression is a combination of values, variables, operators, and calls to functions.
 Your expression should work with any number of data points in X_train and X_test
and any number of values in the data points.
 You can assume that the number of attribute values in each data point is the same
for both X_train and X_test.
 There must be no explicit loops in your expression.
You may find the following attribute or functions useful for this question.
 Dot product of two arrays, a and b:
numpy.dot(a, b)
– It returns the product of matrix multiplication.
 Return the element-wise square of an array, x
numpy.square(x)
 Return the sum of array elements over a given axis.
numpy.sum(a, axis=None)
– a: the input array with elements to sum.
– axis: None or int or tuple of ints
 Return the non-negative square-root of an array, element-wise.
numpy.sqrt(x)
– x: the input array with values whose square-roots are required.
 Insert a new axis that will appear at the axis position in the expanded array shape.
numpy.expand_dims(a, axis)
– a: the input array.
– axis: an int or tuple of ints that represents position in the expanded axes where
the new axis (or axes) is placed.
 The transposed array.
numpy.ndarray.T
Write the one-line Python expression below:
print(
)', 19, 3, NULL::jsonb, NULL, NULL, 'Problem 2 [19 points] Python Fundamentals
(a)
[7 points] Consider the following NumPy arrays:
Marking scheme:
 1 point each sub-questions, i.e. (i) to (vii)
 No partial score
import numpy as np
A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
B = np.array([[[0, 1, 2, 3],
[4, 5, 6, 7],
[8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
Write the output for each of the following Python statements. If the output is an empty
array, write “Empty Array”. If an error occurs, write “Error”.
(i) print(A[::3])
Answer:
[1 4 7]
(ii) print(A[:-2:-2])
Answer:
[9]
(iii) print(B[:, 1, 3:0:-1])
Answer:
[[ 7
5]
[19 18 17]]
(iv) print(B[0, 1, [0,3]])
Answer:
[4 7]
(v) print(B[ B % 4 == 0 ])
Answer:
[ 0
8 12 16 20]
(vi) # numpy.sum(a, axis)
# returns the sum of array elements over a given axis
print( np.sum( B, axis=1 ) )
Answer:
[[12 15 18 21]
[48 51 54 57]]
(vii) # numpy.ndarray.reshape(shape)
# returns an array containing the same data with a new shape
print( B.reshape( (3, -1, 2) ) )
Answer:
[[[ 0
1]
[ 2
3]
[ 4
5]
[ 6
7]]
[[ 8
9]
[10 11]
[12 13]
[14 15]]
[[16 17]
[18 19]
[20 21]
[22 23]]]
(b)
[6 points] Write the output for the following Python code segments. If the output is an
empty array, write “Empty Array”. If an error occurs, write “Error”.
Marking scheme:
 2 points each sub-questions, i.e. (i) to (iii)
 No partial score
(i) import numpy as np
A = np.array([[1, 2, 3], [2, 4, 6]])
B = np.array([1, 2, 3])
print(A * B)
Answer:
[[ 1
9]
[ 2
8 18]]
(ii) import numpy as np
A = np.array([1, 2, 3, 4])
B = np.array([[1, 2], [2, 4], [3, 6], [4, 8]])
print(A + B)
Answer:
Error
(iii) import numpy as np
A = np.array([1, 2, 3, 4])
B = np.array([[2], [3], [4]])
print(A + B)
Answer:
[[3 4 5 6]
[4 5 6 7]
[5 6 7 8]]
(c)
[6 points] The cosine similarity of two non-zero vectors, xTrain = (xTrain
, . . . , xTrain
n
)
and xTest = (xTest
, . . . , xTest
n
), can be calculated by the following formula:
Pn
i=1 (xTrain
i
× xTest
i
)
qPn
i=1 (xTrain
i
)2
qPn
i=1 (xTest
i
)2
For example, if a training sample xTrain is (0, 1, 2) and a testing sample xTest is (4, 6,
8), the cosine similarity is:
0 × 4 + 1 × 6 + 2 × 8
√
02 + 12 + 22√
42 + 62 + 82 = 0.91350028
Given the following NumPy arrays, X_train and X_test, where each 1D array represents
a data point:
import numpy as np
X_train = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]])
X_test = np.array([[4, 6, 8], [5, 0, 0]])
Compute the cosine similarity scores between each data point in X_train and each data
point in X_test with a one-line Python expression, such that the evaluated result of
the expression is:
[[0.91350028 0.
]
[1.
0.37139068]
[0.99461155 0.45584231]]
N ote:
 An expression is a combination of values, variables, operators, and calls to functions.
 Your expression should work with any number of data points in X_train and X_test
and any number of values in the data points.
 You can assume that the number of attribute values in each data point is the same
for both X_train and X_test.
 There must be no explicit loops in your expression.
You may find the following attribute or functions useful for this question.
 Dot product of two arrays, a and b:
numpy.dot(a, b)
– It returns the product of matrix multiplication.
 Return the element-wise square of an array, x
numpy.square(x)
 Return the sum of array elements over a given axis.
numpy.sum(a, axis=None)
– a: the input array with elements to sum.
– axis: None or int or tuple of ints
 Return the non-negative square-root of an array, element-wise.
numpy.sqrt(x)
– x: the input array with values whose square-roots are required.
 Insert a new axis that will appear at the axis position in the expanded array shape.
numpy.expand_dims(a, axis)
– a: the input array.
– axis: an int or tuple of ints that represents position in the expanded axes where
the new axis (or axes) is placed.
 The transposed array.
numpy.ndarray.T
Write the one-line Python expression below:
print(
)
Answer:
print(X_train.dot(X_test.T) /
np.dot(np.expand_dims(np.sqrt(np.sum(X_train**2, axis=1)), 1),
np.expand_dims(np.sqrt(np.sum(X_test**2, axis=1)), 1).T)
)
print(X_train.dot(X_test.T) /
np.dot(np.expand_dims(np.sqrt(np.sum(X_train**2, axis=1)), 1),
np.expand_dims(np.sqrt(np.sum(X_test**2, axis=1)), 0))
)
print(np.dot(X_train, X_test.transpose()) /
(np.sqrt(np.sum(np.square(X_train),axis=1)[:, None]) *
np.sqrt(np.sum(np.square(X_test), axis=1)[None, :]))
)
print(np.dot(X_train, np.transpose(X_test)) /
(np.sqrt(np.sum(np.square(X_train),axis=1)[:, np.newaxis]) *
np.sqrt(np.sum(np.square(X_test), axis=1)[np.newaxis,:]))
)
print(np.dot(X_train, X_test.T) /
np.sqrt(np.sum(np.square(X_train), axis=1)).reshape(3,1).dot(
np.sqrt(np.sum(np.square(X_test), axis=1)).reshape(1, 2))
)
Marking scheme:
 2 points for correct numerator
– If not using np.dot correctly, deduct 1 point (np.dot can be replaced by np.matmul,
@);
– If not using np.ndarray.T correctly, deduct 1 point (np.ndarray.T can be replaced
by np.transpose);
– The maximum number of points can be deducted for numerator is 2.
 4 points for correct denominator
– If not using np.expand dims correctly (including specifying axis), deduct 1 point
for each (np.expand dims can be replaced by np.reshape or adding a new axis);
– If not using np.dot correctly, deduct 1 point (np.dot can be replaced by np.matmul,
@);
– If not using np.sqrt correctly, deduct 1 point for each;
– If not using np.sum correcly (including specifying axis), deduct 1 point for each;
– If not using np.square correctly, deduct 1 point for each (np.square can be re-
placed by np.ndarray ** 2);
– The maximum number of points can be deducted for denominator is 4. If points
to be deducted exceed 4, the denominator part gets 0 point.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2022-fall-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [18 points] Conditional Probability and Bayes Classifier
(a)
[4 points] Assume the probability of getting an A+ in COMP 2211 is 0.08. The prob-
ability of getting a score greater than 90 in the midterm exam given that a student
gets an A+ in COMP 2211 is 0.95, and the probability of getting a score greater than
90 in the midterm exam given that the student does not get an A+ in COMP 2211 is 0.05.
(i) Calculate the probability of getting a score greater than 90 in the midterm exam.
Show all your steps. Round your answer to 2 decimal places.
The formula that you may find useful for this question:
P(E) = P(E|B)P(B) + P(E|Not B)P(Not B)
(ii) Use Bayes’ rule to calculate the probability of getting an A+ given that a student
gets a score greater than 90 in the midterm exam. Show all your steps. Round your
answer to 2 decimal places.
The formula that you may find useful for this question:
P(B|E) = P(B)P(E|B)
P(E)
(b)
[8 points] Given the following dataset (X, y) with 10 training examples, each contains
2 attributes (x1, x2) and its binary label y.
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
y = [0 0 1 1 1 1 0 0 1 1]
Suppose you believe that a naive Bayes model would be appropriate for this dataset, and
you want to classify the test sample:
x = [1 1]
(i) Compute the class prior probabilities, i.e., P(y = 0) and P(y = 1).
(ii) Compute the 4 conditional probabilities required by naive Bayes for the test sample,
i.e., P(x1 = 1|y = 0), P(x2 = 1|y = 0), P(x1 = 1|y = 1), P(x2 = 1|y = 1).
(iii) Under the naive Bayes model and the probabilities you compute in parts b(i) and
b(ii), what is the most likely label for the test sample, i.e., 0 or 1? If there is a tie,
please include the word “tie” in your answer. Show all your steps.
The formula that you may find useful for this question:
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) . . . P(ed|Bi))
(c)
[3 points] Briefly describe a zero frequency problem of Naive Bayes classification and
suggest a way to solve the problem. Also, state whether a zero frequency problem occurs
in part (b).
The dataset (X, y) is repeated here to ease your reading.
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
y = [0 0 1 1 1 1 0 0 1 1]
(d)
[3 points] Consider the dataset given in part (b), i.e., the one above. Suppose you do
NOT believe that the naive Bayes model would be appropriate for this dataset, and you
want to classify the test sample:
x = [0 0]
using the Bayes model without making the naive assumption. What is the most
likely label for the test sample? If there is a tie, please include the word “tie” in your
answer. Show all your steps.
The formula that you may find useful for this question:
BNB = argmaxBiP(Bi)P((e1, e2, e3, . . . , ed)|Bi)', 18, 8, NULL::jsonb, NULL, NULL, 'Problem 3 [18 points] Conditional Probability and Bayes Classifier
(a)
[4 points] Assume the probability of getting an A+ in COMP 2211 is 0.08. The prob-
ability of getting a score greater than 90 in the midterm exam given that a student
gets an A+ in COMP 2211 is 0.95, and the probability of getting a score greater than
90 in the midterm exam given that the student does not get an A+ in COMP 2211 is 0.05.
(i) [3 points] Calculate the probability of getting a score greater than 90 in the midterm
exam. Show all your steps. Round your answer to 2 decimal places.
The formula that you may find useful for this question:
P(E) = P(E|B)P(B) + P(E|Not B)P(Not B)
Answer:
P(A+) =0.08
P(> 90|A+) =0.95
P(> 90|Not A+) =0.05
P(> 90) =P(> 90|A+)P(A+) + P(> 90|Not A+)P(Not A+)
=0.95(0.08) + 0.05(0.92) = 0.12
Marking scheme:
 0.5 point for P(A+) = 0.08
 0.5 point for P(> 90|A+) = 0.95
 0.5 point for P(> 90|Not A+) = 0.05
 0.5 point for P(Not A+) = 0.92
 1 point for P(> 90) = 0.12
(ii)
[1 point] Use Bayes’ rule to calculate the probability of getting an A+ given that
a student gets a score greater than 90 in the midterm exam. Show all your steps.
Round your answer to 2 decimal places.
The formula that you may find useful for this question:
P(B|E) = P(B)P(E|B)
P(E)
Answer:
P(A + | > 90) = P(A+)P(> 90|A+)
P(> 90)
= 0.08(0.95)
0.12
= 0.63
Marking scheme:
 1 point for P(A + | > 90) = 0.63
(b)
[8 points] Given the following dataset (X, y) with 10 training examples, each contains
2 attributes (x1, x2) and its binary label y.
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
y = [0 0 1 1 1 1 0 0 1 1]
Suppose you believe that a naive Bayes model would be appropriate for this dataset, and
you want to classify the test sample:
x = [1 1]
(i)
[1 point] Compute the class prior probabilities, i.e., P(y = 0) and P(y = 1).
Answer:
P(y = 0) = 4
10 = 2
P(y = 1) = 6
10 = 3
Marking scheme:
 0.5 point for P(y = 0) = 2
 0.5 point for P(y = 1) = 3
(ii)
[4 points] Compute the 4 conditional probabilities required by naive Bayes for
the test sample, i.e., P(x1 = 1|y = 0), P(x2 = 1|y = 0), P(x1 = 1|y = 1),
P(x2 = 1|y = 1).
Answer:
P(x1 = 1|y = 0) = 0
P(x2 = 1|y = 0) = 3
P(x1 = 1|y = 1) = 5
P(x2 = 1|y = 1) = 3
6 = 1
Marking scheme:
 1 point for P(x1 = 1|y = 0) = 0
 1 point for P(x2 = 1|y = 0) = 3
 1 point for P(x1 = 1|y = 1) = 5
 1 point for P(x2 = 1|y = 1) = 1
(iii)
[3 points] Under the naive Bayes model and the probabilities you compute in parts
b(i) and b(ii), what is the most likely label for the test sample, i.e., 0 or 1? If there
is a tie, please include the word “tie” in your answer. Show all your steps.
The formula that you may find useful for this question:
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) . . . P(ed|Bi))
Answer:
The numerator part of P(y = 0|x) = P(y = 0)P(x1 = 1|y = 0)P(x2 = 1|y = 0)
= 0
The numerator part of P(y = 1|x) = P(y = 1)P(x1 = 1|y = 1)P(x2 = 1|y = 1)
= 3
5
 1

= 1
So, the most likely label for the test sample is 1.
Marking scheme:
 1 point for the numerator part of P(y = 0|x) = 0
 1 point for the numerator part of P(y = 1|x) = 1
 1 point for stating the most likely label for the test sample is 1
(c)
[3 points] Briefly describe a zero frequency problem of Naive Bayes classification and
suggest a way to solve the problem. Also, state whether a zero frequency problem occurs
in part (b).
Answer:
If categorical variable has a category in test data set, which was not observed in the
training data set, then the frequency-based probability estimate will be zero. And we
will get a zero when all the probabilities are multiplied. This will be unable to make a
prediction. This is known as zero frequency.
A way to overcome this “zero frequency problem” is to add one to the count for ev-
ery attribute value-class combination when an attribute value does not occur with every
class value.
A zero frequency problem occurs in part (b).
Marking scheme:
 1 point for briefly describing a zero frequency problem
 1 point for suggesting a way to solve the problem
*** Please note that suggesting “adding more training data/increase the dataset
size/Using Log()” will not get any mark ***
 1 point for stating a zero frequency problem occurs in part (b)
The dataset (X, y) is repeated here to ease your reading.
X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]]
y = [0 0 1 1 1 1 0 0 1 1]
(d)
[3 points] Consider the dataset given in part (b), i.e., the one above. Suppose you do
NOT believe that the naive Bayes model would be appropriate for this dataset, and you
want to classify the test sample:
x = [0 0]
using the Bayes model without making the naive assumption. What is the most
likely label for the test sample? If there is a tie, please include the word “tie” in your
answer. Show all your steps.
The formula that you may find useful for this question:
BNB = argmaxBiP(Bi)P((e1, e2, e3, . . . , ed)|Bi)
Answer:
The numerator part of P(y = 0|x) = P(y = 0)P((x1 = 0, x2 = 0)|y = 0)
=
2
 1

= 1
The numerator part of P(y = 1|x) = P(y = 1)P((x1 = 0, x2 = 0)|y = 1)
=
3
 1

= 1
As the two probabilities are the same value, it’s a tie.
Marking scheme:
 1 point for the numerator part of P(y = 0|x) = 1
 1 point for the numerator part of P(y = 1|x) = 1
 1 point for stating it’s a tie', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2022-fall-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors
Consider a set of 8 training data given as (xTrain, cTrain) values, where xTrain is a string with
6 bits and cTrain is the multi-class label, A, B or C:
{ ("101110", A), ("110100", B), ("011100", B), ("000101", B),
("011111", B), ("101101", B), ("100011", A), ("010000", C) }
Classify a test sample (xTest) with the string “101011” using a K-Nearest Neighbors classifier
(KNN) and Hamming distance.
Hamming distance is the number of bit positions in which the two bits are different. For
example, the hamming distance between two strings, “11011001” and “10011101” is 2.
(a)
[5 points] Complete the following table by filling in the computed Hamming distance
between each training data point and the test sample. Also, determine the class label of
the test sample using KNN with K = 3 based on the results.
xTrain
cTrain
Distance
“101110”
A
“110100”
B
“011100”
B
“000101”
B
“011111”
B
“101101”
B
“100011”
A
“010000”
C
The predicted class label of the test sample is
.
(b)
[3 points] What will happen if we use a KNN classifier with K = 4 instead to classify
the test sample above? Describe your answer.
(c)
[3 points] A student claims that if a KNN classifier with K = 3 is used with the above
training data, the predicted class label will never be class C no matter what the test
example is, if there is no tie-breaking strategy used. Is this claim correct or not? Explain
your answer.
(d)
[3 points] Is the above training data suitable for performing a D-fold cross-validation?
Explain your answer.', 14, 12, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors
Consider a set of 8 training data given as (xTrain, cTrain) values, where xTrain is a string with
6 bits and cTrain is the multi-class label, A, B or C:
{ ("101110", A), ("110100", B), ("011100", B), ("000101", B),
("011111", B), ("101101", B), ("100011", A), ("010000", C) }
Classify a test sample (xTest) with the string “101011” using a K-Nearest Neighbors classifier
(KNN) and Hamming distance.
Hamming distance is the number of bit positions in which the two bits are different. For
example, the hamming distance between two strings, “11011001” and “10011101” is 2.
(a)
[5 points] Complete the following table by filling in the computed Hamming distance
between each training data point and the test sample. Also, determine the class label of
the test sample using KNN with K = 3 based on the results.
xTrain
cTrain
Distance
“101110”
A
“110100”
B
“011100”
B
“000101”
B
“011111”
B
“101101”
B
“100011”
A
“010000”
C
The predicted class label of the test sample is
.
Answer:
xTrain
cTrain
Distance
“101110”
A
“110100”
B
“011100”
B
“000101”
B
“011111”
B
“101101”
B
“100011”
A
“010000”
C
The predicted class label of the test sample is A.
Marking scheme:
 0.5 point for each distance, i.e. 4 points total
 1 point for the predicted class label
(b)
[3 points] What will happen if we use a KNN classifier with K = 4 instead to classify
the test sample above? Describe your answer.
Answer:
We will have a tie where 2 nearest neighbors are with class label A and 2 nearest neighbors
are with class label B.
Marking scheme:
 1 point for mentioning “Tie” or ”cannot find the class label”
 2 points for the description of “2 NNs are class label A and 2 NNs are class label B”
(c)
[3 points] A student claims that if a KNN classifier with K = 3 is used with the above
training data, the predicted class label will never be class C no matter what the test
example is, if there is no tie-breaking strategy used. Is this claim correct or not? Explain
your answer.
Answer:
The claim is correct. If k = 3 and assume equal weights for all the points, no mat-
ter what the testing example is, it will be one of the following 3 cases:
 none of the 3 nearest neighbors is with class label C, so the predicted label will not
be C.
 1 of the 3 nearest neighbors is with class label C and the other 2 neighbors are both
with class label A, or both with class label B, so the predicted class label will only
be A or B.
 A tie
Marking scheme:
 1 point for “correct” or “yes”
 2 points for the explanation
(d)
[3 points] Is the above training data suitable for performing a D-fold cross-validation?
Explain your answer.
Answer:
No, unless the number of folds is equal to the number of training data points.
The
above training data set is umbalanced. Hence, if we split the above training data set, the
data used for training may not have samples with class labels A or C.
Marking scheme:
 1 point for “no”
 2 points for the explanation', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-fall-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [12 points] K-Means Clustering
Consider the following training dataset:
Example
A
B
C
D
E
Attribute value
0.2
0.5
0.7
2.5
3.5
Apply the K-means clustering algorithm to this dataset for K=2, i.e., you will produce two
data clusters. Assume the distance measure you use is Euclidean distance, defined as follows.
distance(xTrain, xTest) =
v
u
u
t
n
X
i=1
(xTrain
i
−xTest
i
)2
(a) [1.5 points] Assume examples A and B are chosen as the initial centroid of clusters 1 and
cluster 2, respectively. Write down the resulting cluster assignments in the table below.
The cluster assignment for A and B has been done for you.
Example
A
B
C
D
E
Cluster Assignment
(b) [2 points] After assigning the examples to the clusters in part (a), re-compute the cluster
centroids to be the mean of the examples currently assigned to each cluster. For each
cluster, write the new cluster centroid in the table below.
Cluster
Centroid
(c)
[2.5 points] After recomputing the cluster centroids in part (b), re-assign the examples
to the clusters to which they are closest. Write down the resulting cluster assignments
in the table below.
Example
A
B
C
D
E
Cluster Assignment
(d) [2 points] After assigning the examples to the clusters in part (c), re-compute the cluster
centroids to be the mean of the examples currently assigned to each cluster. For each
cluster, write the new cluster centroid in the table below. If your answer is a floating
point number, round it to 2 decimal places.
Cluster
Centroid
(e)
[2.5 points] After recomputing the cluster centroids in part (d), re-assign the examples
to the clusters to which they are closest. Write down the resulting cluster assignments
in the table below.
Example
A
B
C
D
E
Cluster Assignment
(f)
[1.5 points] State whether the K-means clustering algorithm converges for this dataset
after performing all the steps in parts (a)-(e). Explain your answer.', 12, 14, NULL::jsonb, NULL, NULL, 'Problem 5 [12 points] K-Means Clustering
Consider the following training dataset:
Example
A
B
C
D
E
Attribute value
0.2
0.5
0.7
2.5
3.5
Apply the K-means clustering algorithm to this dataset for K=2, i.e., you will produce two
data clusters. Assume the distance measure you use is Euclidean distance, defined as follows.
distance(xTrain, xTest) =
v
u
u
t
n
X
i=1
(xTrain
i
−xTest
i
)2
(a) [1.5 points] Assume examples A and B are chosen as the initial centroid of clusters 1 and
cluster 2, respectively. Write down the resulting cluster assignments in the table below.
The cluster assignment for A and B has been done for you.
Example
A
B
C
D
E
Cluster Assignment
Marking scheme:
 0.5 point for each correct cluster assignment. 1.5 points in total
(b) [2 points] After assigning the examples to the clusters in part (a), re-compute the cluster
centroids to be the mean of the examples currently assigned to each cluster. For each
cluster, write the new cluster centroid in the table below.
Cluster
Centroid
0.2
1.8
Marking scheme:
 0.5 point for each correct centroid. 1 point in total
(c)
[2.5 points] After recomputing the cluster centroids in part (b), re-assign the examples
to the clusters to which they are closest. Write down the resulting cluster assignments
in the table below.
Example
A
B
C
D
E
Cluster Assignment
Marking scheme:
 0.5 point for each correct cluster assignment. 2.5 points in total
(d) [2 points] After assigning the examples to the clusters in part (c), re-compute the cluster
centroids to be the mean of the examples currently assigned to each cluster. For each
cluster, write the new cluster centroid in the table below. If your answer is a floating
point number, round it to 2 decimal places.
Cluster
Centroid
0.47
Marking scheme:
 1 point for each correct centroid. 2 points in total
(e)
[2.5 points] After recomputing the cluster centroids in part (d), re-assign the examples
to the clusters to which they are closest. Write down the resulting cluster assignments
in the table below.
Example
A
B
C
D
E
Cluster Assignment
Marking scheme:
 0.5 point for each correct cluster assignment. 2.5 points in total
(f)
[1.5 points] State whether the K-means clustering algorithm converges for this dataset
after performing all the steps in parts (a)-(e). Explain your answer.
Answer:
The algorithm converges for this dataset since the cluster assignment of examples re-
mains the same.
Marking scheme:
 0.5 point for stating the algorithm converges
 1 point for the correct explanation', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-fall-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [14 points] Perceptron
Suppose you are a CSE professor at HKUST who wants to offer a new course, COMP 2211.
Before starting, you want to predict if COMP 2211 will be successful or not. You approach
two students, A and B, asking them to read the proposal of 5 existing courses, COMP 1111,
COMP 2222, COMP 3333, COMP 4444, and COMP 5555, and rate them on a scale of 1
to 5 (assume only integer scores). Assume each student reads the proposals independently
and announces their rating. As each student might be biased, you may be unable to average
their ratings. Instead, you decide to use a perceptron to classify the data using the training
dataset in Table 1.
Course Code
x1
x2
Success
COMP 1111
No
COMP 2222
Yes
COMP 3333
No
COMP 4444
Yes
COMP 5555
Yes
Table 1: Training dataset
The training dataset is composed of the two student scores, x1 = score given by student A,
and x2 = score given by student B, and the corresponding true label regarding the success
of the courses judged based on the SFQ, for the 5 courses.
(a) [8 points] Train the perceptron to generate O = 1 if the course is success, O = 0 otherwise,
using the initial weight of w1 = 0, w2 = 0, θ = -1, and the activation function:
f(z) = 0 if z ≤ 0
f(z) = 1 otherwise
Also, use a learning rate of η = 1.
You may find the updating rules below useful.
∆wi = η(T −O)xi
∆θ = η(T −O)
wi = wi + ∆wi
θ = θ + ∆θ
Present each row of Table 1 as a training example, and update the perceptron weights
and bias before moving on to the next row. Show all the steps by completing Table 2.
x1
x2
T
O
∆w1
w1
∆w2
w2
∆θ
θ
-
-
-
-
-
-
-
-1
Table 2: Perceptron algorithm execution
(b)
[6 points] Now, assume you do not care about the true label regarding the success of
the courses judged based on the SFQ. State whether each of the following scenarios for
which a perceptron using the ratings of the two students can correctly classify the data.
Justify your answer.
(i) If the total of their ratings is more than 8, then the course will be success and
otherwise it will fail, i.e.,
Course Code
x1
x2
Success
COMP 1111
No
COMP 2222
No
COMP 3333
Yes
COMP 4444
No
COMP 5555
No
(ii) The course will succeed if and only if each reviewer gives either a rating of 2 or a
rating of 3, i.e.,
Course Code
x1
x2
Success
COMP 1111
No
COMP 2222
Yes
COMP 3333
No
COMP 4444
No
COMP 5555
Yes', 14, 16, NULL::jsonb, NULL, NULL, 'Problem 6 [14 points] Perceptron
Suppose you are a CSE professor at HKUST who wants to offer a new course, COMP 2211.
Before starting, you want to predict if COMP 2211 will be successful or not. You approach
two students, A and B, asking them to read the proposal of 5 existing courses, COMP 1111,
COMP 2222, COMP 3333, COMP 4444, and COMP 5555, and rate them on a scale of 1
to 5 (assume only integer scores). Assume each student reads the proposals independently
and announces their rating. As each student might be biased, you may be unable to average
their ratings. Instead, you decide to use a perceptron to classify the data using the training
dataset in Table 1.
Course Code
x1
x2
Success
COMP 1111
No
COMP 2222
Yes
COMP 3333
No
COMP 4444
Yes
COMP 5555
Yes
Table 1: Training dataset
The training dataset is composed of the two student scores, x1 = score given by student A,
and x2 = score given by student B, and the corresponding true label regarding the success
of the courses judged based on the SFQ, for the 5 courses.
(a) [8 points] Train the perceptron to generate O = 1 if the course is success, O = 0 otherwise,
using the initial weight of w1 = 0, w2 = 0, θ = -1, and the activation function:
f(z) = 0 if z ≤ 0
f(z) = 1 otherwise
Also, use a learning rate of η = 1.
You may find the updating rules below useful.
∆wi = η(T −O)xi
∆θ = η(T −O)
wi = wi + ∆wi
θ = θ + ∆θ
Present each row of Table 1 as a training example, and update the perceptron weights
and bias before moving on to the next row. Show all the steps by completing Table 2.
x1
x2
T
O
∆w1
w1
∆w2
w2
∆θ
θ
-
-
-
-
-
-
-
-1
-1
-4
-1
-5
-3
-1
-1
Table 2: Perceptron algorithm execution
Marking scheme:
 0.2 point for each correct value. 40 values, 8 points in total
(b)
[6 points] Now, assume you do not care about the true label regarding the success of
the courses judged based on the SFQ. State whether each of the following scenarios for
which a perceptron using the ratings of the two students can correctly classify the data.
Justify your answer.
(i)
[3 points] If the total of their ratings is more than 8, then the course will be success
and otherwise it will fail, i.e.,
Course Code
x1
x2
Success
COMP 1111
No
COMP 2222
No
COMP 3333
Yes
COMP 4444
No
COMP 5555
No
Answer:
Perceptron can correctly classify the data in this scenario, because the data are
linearly separable.
Marking scheme:
 1 point for stating the perceptron can correctly classify the data
 2 points for the explanation
*** 2 points for the explanation part if correct weights and bias are given. ***
(ii)
[3 points] The course will succeed if and only if each reviewer gives either a rating
of 2 or a rating of 3, i.e.,
Course Code
x1
x2
Success
COMP 1111
No
COMP 2222
Yes
COMP 3333
No
COMP 4444
No
COMP 5555
Yes
Answer:
Perceptron cannot correctly classify the data in this scenario, because the data are
not linearly separable.
Marking scheme:
 1 point for stating the perceptron cannot correctly classify the data
 2 points for the explanation', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-fall-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [8 points] Multilayer Perceptron
Given the following training dataset with 4 examples in which each example has 3 binary
input attributes, x1, x2, x3 and 1 binary output T.
x1
x2
x3
T
Suppose we are learning a Multilayer Perceptron with the training dataset above; the neural
network has 1 hidden layer of 2 neurons, answer the following questions.
(a)
[2 points] How many weights and how many biases are there in the neural network?
(b)
[6 points] Draw the neural network below.
-------------------- END OF PAPER
--------------------
/* Rough work */
/* Rough work */', 8, 18, NULL::jsonb, NULL, NULL, 'Problem 7 [8 points] Multilayer Perceptron
Given the following training dataset with 4 examples in which each example has 3 binary
input attributes, x1, x2, x3 and 1 binary output T.
x1
x2
x3
T
Suppose we are learning a Multilayer Perceptron with the training dataset above; the neural
network has 1 hidden layer of 2 neurons, answer the following questions.
(a)
[2 points] How many weights and how many biases are there in the neural network?
Answer:
8 weights and 3 biases
Marking scheme:
 1 point for the number of weights
 1 point for the number of biases
*** 1 point if students only put 11 as the sum of these variables (e.g., 8 + 3 = 11) and
do not indicate 8 for weights and 3 for biases. ***
(b)
[6 points] Draw the neural network below.
Answer:
Marking scheme:
 3 points for indicating 3 input nodes, 2 hidden neurons, and 1 output neuron.
-0.5 point for losing 1 node/neuron.
 1 point for indicating 8 weights.
-0.5 point for losing 1 weight indicator, at most -1.
 1 point for indicating 3 biases.
-0.5 point for losing 1 bias indicator, at most -1.
 1 point for lines that connected layers.
-0.5 point for losing 1 connecting line, at most -1.
*** Additionally, suppose the number of nodes/neurons, the number of weights, the
number of biases, and the number of lines are more than expected. In that case, we will
only consider those correct ones, and the layer containing additional nodes/neurons will
be regarded as wrong. (e.g. if you put 3->4->1 as network, then only the input nodes
and output neuron consider to be correct, while all lines, weights, and biases should be
wrong, you can get 2 points.) ***
-------------------- END OF PAPER
--------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'easy', '', '', ''),
    ('COMP2211-2022-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [15 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1.5 points for each correct answer.
(a) Machine learning is a sub-field of artificial intelligence.
(b) Tensorflow is easy to use and less flexible than Keras.
(c) Suppose we wish to calculate P(B|e1, e2) and we have no conditional independence in-
formation. Having P(e1, e2), P(B), P(e1, e2|B) are sufficient for the calculation.
(d) Training a K-nearest neighbors classifier takes more computational time than applying
it.
(e) K-nearest neighbors cannot be used for regression.
(f) Consider D-fold cross-validation. A higher number of folds (i.e. larger value of D), the
estimated error will be lower on average.
(g) K-means clustering algorithm is guaranteed to converge.
(h) Consider a two-class classification problem. Suppose we have trained a perceptron model
on a linearly separable training set, and now we get a new labeled data point which is
correctly classified by the model, and far away from the decision boundary. If we add
this new point to our earlier training set and re-train with the same set of initial weights
and bias, the learnt decision boundary will be changed for sure.
(i) Gradient descent is usually NOT guaranteed to converge at global minimum.
(j) If the learning rate is too small, gradient descent may take a very long time to converge
and computationally expensive.
Question
Answer (T/F)
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)', 15, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [15 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1.5 points for each correct answer.
(a) Machine learning is a sub-field of artificial intelligence.
(b) Tensorflow is easy to use and less flexible than Keras.
(c) Suppose we wish to calculate P(B|e1, e2) and we have no conditional independence in-
formation. Having P(e1, e2), P(B), P(e1, e2|B) are sufficient for the calculation.
(d) Training a K-nearest neighbors classifier takes more computational time than applying
it.
(e) K-nearest neighbors cannot be used for regression.
(f) Consider D-fold cross-validation. A higher number of folds (i.e. larger value of D), the
estimated error will be lower on average.
(g) K-means clustering algorithm is guaranteed to converge.
(h) Consider a two-class classification problem. Suppose we have trained a perceptron model
on a linearly separable training set, and now we get a new labeled data point which is
correctly classified by the model, and far away from the decision boundary. If we add
this new point to our earlier training set and re-train with the same set of initial weights
and bias, the learnt decision boundary will be changed for sure.
(i) Gradient descent is usually NOT guaranteed to converge at global minimum.
(j) If the learning rate is too small, gradient descent may take a very long time to converge
and computationally expensive.
Question
Answer (T/F)
(a)
T
(b)
F
(c)
T
(d)
F
(e)
F
(f)
T
(g)
T
(h)
F
(i)
T
(j)
T
Marking scheme:
 1.5 points for each correct answer.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [16 points] Python Fundamentals
For each of the Python expressions below, write the output when the expression is evaluated.
If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”.
(a)
[5 points] Consider the following NumPy arrays:
import numpy as np
A = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
B = np.array([ [0, 1, 2, 3],
[4, 5, 6, 7],
[8, 9, 10, 11],
[12, 13, 14, 15] ])
(i) print(A[2:6:3])
(ii) print(B[0:2, 1:3])
(iii) print(A[-1:-3])
(iv) print(B[::-1])
(v) print(B[:3,2:])
(b)
[2 points] What is the output of the following code?
import numpy as np
X = np.array([2,2,0,2,2,0,1,1,0,1,1,0])
Y = X[X != 0]
print(Y[::2])
(c)
[9 points] Given the following code which computes the distance between each training
point in X_train and each test point in X_test using a nested loop over both the training
data and test data.
import numpy as np
def compute_distances_nested_loops(X_train, X_test):
    num_test = X_test.shape[0]
    num_train = X_train.shape[0]
    distances = np.zeros((num_test, num_train))
    # --- BLOCK TO REWRITE ---
    for i in range(num_test):
        for j in range(num_train):
            distances[i,j] = np.sqrt(np.sum(np.square(X_test[i,:]-X_train[j,:])))
    # --- BLOCK TO REWRITE ---
    return distances
Train = np.array([[0,1], [1,2], [2,3], [3,4]])
Test = np.array([[5,6], [7,8]])
print(compute_distances_nested_loops(Train, Test))
# Output:
# [[7.07106781 5.65685425 4.24264069 2.82842712]
#  [9.89949494 8.48528137 7.07106781 5.65685425]]
Rewrite the block of code between the comment lines # --- BLOCK TO REWRITE ---
using no explicit loops in the space provided.
Hints:
 To compute the distance between training data point (0, 1) and test data point (5, 6),
we do
dist = (0 − 5)^2 + (1 − 6)^2
 Expand it
dist = 0^2 − 2(0)(5) + 5^2 + 1^2 − 2(1)(6) + 6^2
= 0^2 + 1^2 + 5^2 + 6^2 − 2(0)(5) − 2(1)(6)
= 0^2 + 1^2 + 5^2 + 6^2 − 2((0)(5) + (1)(6))
You may find the following functions useful for this question.
 Dot product of two arrays:
numpy.dot(a, b)
– It returns the product of matrix multiplication.
 Return the element-wise square of the input
numpy.square(x)
– x is the input data
 Return the sum of array elements over a given axis.
numpy.sum(a, axis=None)
– a is an array with elements to sum.
– axis: None or int or tuple of ints. axis = 0 means along the column and axis =
1 means working along the row.
 Return the non-negative square-root of an array, element-wise.
numpy.sqrt(x)
– x is the array with values whose square-roots are required.
 Return a matrix (or a 2D array) from an 1D array.
numpy.matrix(data)
– data is the 1D array.
– Example: numpy.matrix([1, 2, 3]), output is [[1, 2, 3]]
 Insert a new axis that will appear at the axis position in the expanded array shape.
numpy.expand_dims(a,axis)
– a is the input array.
– axis is an int or tuple of ints that represents position in the expanded axes where
the new axis (or axes) is placed.', 16, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [16 points] Python Fundamentals
For each of the Python expressions below, write the output when the expression is evaluated.
If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”.
(a)
[5 points] Consider the following NumPy arrays:
import numpy as np
A = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
B = np.array([ [0, 1, 2, 3],
[4, 5, 6, 7],
[8, 9, 10, 11],
[12, 13, 14, 15] ])
(i) print(A[2:6:3])
Answer: [30 60]
# 1 point
(ii) print(B[0:2, 1:3])
Answer:
[[1 2]
[5 6]]
# 1 point
(iii) print(A[-1:-3])
Answer:
Empty Array
# 1 point
(iv) print(B[::-1])
Answer:
[[12 13 14 15]
 [ 8  9 10 11]
 [ 4  5  6  7]
 [ 0  1  2  3]]
# 1 point
(v) print(B[:3,2:])
Answer:
[[ 2  3]
 [ 6  7]
 [10 11]]
# 1 point
Remark:
 0.5 point will be deducted if the answers are in raw number format without [](i.e.
1 2 5 6). Because in this case, the answer is no longer in array type, and we can-
not distinguish whether the answer is a 2d array or 1d array. This deduction only
performs once for a(i) and a(ii).
(b)
[2 points] What is the output of the following code?
import numpy as np
X = np.array([2,2,0,2,2,0,1,1,0,1,1,0])
Y = X[X != 0]
print(Y[::2])
Answer:
[2 2 1 1]
# 2 points
Remark:
 0.5 point will be deducted if the answers are in raw number format without [](i.e.
1 2 5 6). Because in this case, the answer is no longer in array type, and we can-
not distinguish whether the answer is a 2d array or 1d array. This deduction only
performs once for a(i) and a(ii).
(c)
[9 points] Given the following code which computes the distance between each training
point in X_train and each test point in X_test using a nested loop over both the training
data and test data.
import numpy as np
def compute_distances_nested_loops(X_train, X_test):
    num_test = X_test.shape[0]
    num_train = X_train.shape[0]
    distances = np.zeros((num_test, num_train))
    # --- BLOCK TO REWRITE ---
    for i in range(num_test):
        for j in range(num_train):
            distances[i,j] = np.sqrt(np.sum(np.square(X_test[i,:]-X_train[j,:])))
    # --- BLOCK TO REWRITE ---
    return distances
Train = np.array([[0,1], [1,2], [2,3], [3,4]])
Test = np.array([[5,6], [7,8]])
print(compute_distances_nested_loops(Train, Test))
# Output:
# [[7.07106781 5.65685425 4.24264069 2.82842712]
#  [9.89949494 8.48528137 7.07106781 5.65685425]]
Rewrite the block of code between the comment lines # --- BLOCK TO REWRITE ---
using no explicit loops in the space provided.
Hints:
 To compute the distance between training data point (0, 1) and test data point (5, 6),
we do
dist = (0 − 5)^2 + (1 − 6)^2
 Expand it
dist = 0^2 − 2(0)(5) + 5^2 + 1^2 − 2(1)(6) + 6^2
= 0^2 + 1^2 + 5^2 + 6^2 − 2(0)(5) − 2(1)(6)
= 0^2 + 1^2 + 5^2 + 6^2 − 2((0)(5) + (1)(6))
You may find the following functions useful for this question.
 Dot product of two arrays:
numpy.dot(a, b)
– It returns the product of matrix multiplication.
 Return the element-wise square of the input
numpy.square(x)
– x is the input data
 Return the sum of array elements over a given axis.
numpy.sum(a, axis=None)
– a is an array with elements to sum.
– axis: None or int or tuple of ints. axis = 0 means along the column and axis =
1 means working along the row.
 Return the non-negative square-root of an array, element-wise.
numpy.sqrt(x)
– x is the array with values whose square-roots are required.
 Return a matrix (or a 2D array) from an 1D array.
numpy.matrix(data)
– data is the 1D array.
– Example: numpy.matrix([1, 2, 3]), output is [[1, 2, 3]]
 Insert a new axis that will appear at the axis position in the expanded array shape.
numpy.expand_dims(a,axis)
– a is the input array.
– axis is an int or tuple of ints that represents position in the expanded axes where
the new axis (or axes) is placed.
Answer:
M = np.dot(X_test, X_train.T)
te = np.square(X_test).sum(axis=1)
tr = np.square(X_train).sum(axis=1)
dists = np.sqrt(-2*M + np.matrix(tr) + np.matrix(te).T)
Alternative answer:
distances = np.sqrt(np.sum(np.square(np.expand_dims(X_test, 1) - X_train), axis=2))
distances = np.sqrt(np.sum(np.square(X_test[:,None] - X_train[None]), axis=-1))
Marking scheme:
 Case 0: Any for loop, list comprehension. # 0 point
 Case 1:
(a) M = np.dot(X_test, X_train.T) # 2 points
1 point if missing transpose
(b) te = np.square(X_test).sum(axis=1) # 2 points
tr = np.square(X_train).sum(axis=1) # 2 points
– each square # 1 point
– each sum with right axis # 1 point
(c) dists = np.sqrt(-2*M + np.matrix(tr) + np.matrix(te).T) # 3 points
– 1 point if shape doesn’t match.
– 2 points if np.matrix is missing, or attempt to modify the shape but wrong
– if np.sqrt is missing,
* if previous part isn’t calculated correctly, 0 point for this part.
* -1 point otherwise.
 Case 2:
distances = np.sqrt(np.sum(np.square(np.expand_dims(X_test, 1) - X_train), axis=2))
# 4 points
(a) 4 points for the subtraction
– 2 points if the shape modification is wrong.
– 0 points if no shape modification at all.
(b) 2 points for square
(c) 2 points for sum;
1 point if axis is missing or wrong
(d) 1 point for sqrt (unlike Case 1, sqrt is worth 1 point here.)
(e) -3 points if a^2 − b^2 is used.
Remarks:
(a) If reshaped is used to change the shape of the solution, -1 point.
(b) If the program assume there’s 4 train points or 2 test points, and the number of
dimension is 2, -1 point or 0 point for the whole question.
(c) If Train, Test used instead of Xtrain, Xtest, -1 point. If your program are terribly
wrong (< 3 points) anyway, this mark wouldn’t be deducted.
(d) If extra code which leads to wrong answer, -1 point.
(e) If the output shape is (numtrain, numtest) instead of (numtest, numtrain), -1 point.
(f) If expand dims/reshape used without reassigning the variable, i.e. a = a.reshape(*b.shape),
-1 point.
(g) If missing shape, (e.g. Xtrain[0] instead of Xtrain.shape[0]), – 0.5 point.
(h) Wrong spelling and syntax will not resulted in mark deduction, but 0.5 point will be
deducted if you got full points.
(i) Index using i j assuming the existence of for loop gets 0 point.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2022-spring-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [11 points] Conditional Probability and Bayes Classifier
(a)
[2 points] Given the following probabilities:
 P(Good course | Desmond is in the course) = 0.5
 P(Good course | Desmond is not in the course) = 0.3
 P(Desmond in a randomly chosen course) = 0.1
What is P(Desmond is in the course | Not a good course)? If your answer is not an
integer, write your answer in fraction form (use / to separate your numerator and de-
nominator), e.g. 1/2.
(b)
[7 points] Suppose you are given the following set of data with three Boolean input
variables A, B, C, and a single Boolean output variable D.
A
B
C
D
(i) According to the Naive Bayes classifier, what is P(D = 1|A = 1, B = 1, C = 0)? If
your answer is not an integer, write your answer in fraction form (use / to separate
your numerator and denominator), e.g. 1/2.
(ii) According to the Naive Bayes classifier, what is P(D = 0|A = 1, B = 1)? If your
answer is not an integer, write your answer in fraction form (use / to separate your
numerator and denominator), e.g. 1/2.
(iii) According to the general Bayes classifier (i.e. without independence assumption),
what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your
answer in fraction form (use / to separate your numerator and denominator), e.g.
1/2.
(iv) According to the general Bayes classifier (i.e. without independence assumption),
what is P(D = 1|A = 1, B = 1)? If your answer is not an integer, write your answer
in fraction form (use / to separate your numerator and denominator), e.g. 1/2.
(c)
[2 points] The Naive Bayes algorithm selects the class c for an example x that maxi-
mizes P(c|x). Suppose one of your classmates stated that it is equivalent to selecting the
c that maximizes P(x|c) under an assumption. What is the assumption that he has made?', 11, 7, NULL::jsonb, NULL, NULL, 'Problem 3 [11 points] Conditional Probability and Bayes Classifier
(a)
[2 points] Given the following probabilities:
 P(Good course | Desmond is in the course) = 0.5
 P(Good course | Desmond is not in the course) = 0.3
 P(Desmond in a randomly chosen course) = 0.1
What is P(Desmond is in the course | Not a good course)? If your answer is not an
integer, write your answer in fraction form (use / to separate your numerator and de-
nominator), e.g. 1/2.
Answer:
Let D be Desmond is in the course, G be a good course
P(D|Not G) = P(D, Not G) / P(Not G)
= P(Not G|D)P(D) / [P(Not G|D)P(D) + P(Not G|Not D)P(Not D)]
= [(1 − 0.5) × 0.1] / [(1 − 0.5) × 0.1 + (1 − 0.3) × (1 − 0.1)]
= 5/68
Marking scheme:
 2 points for giving the correct answer.
(b)
[7 points] Suppose you are given the following set of data with three Boolean input
variables A, B, C, and a single Boolean output variable D.
A
B
C
D
(i) According to the Naive Bayes classifier, what is P(D = 1|A = 1, B = 1, C = 0)? If
your answer is not an integer, write your answer in fraction form (use / to separate
your numerator and denominator), e.g. 1/2.
Answer:
P(D = 1|A = 1, B = 1, C = 0)
= P(D = 1)P(A = 1|D = 1)P(B = 1|D = 1)P(C = 0|D = 1) / [Σ_{j=0}^{1} P(D = j)P(A = 1|D = j)P(B = 1|D = j)P(C = 0|D = j)]
= (4/8)(2/4)(1/4)(2/4) / [(4/8)(2/4)(2/4)(1/4) + (4/8)(2/4)(1/4)(2/4)]
= 1/2
Marking scheme:
 1.75 points for giving the correct answer.
(ii) According to the Naive Bayes classifier, what is P(D = 0|A = 1, B = 1)? If your
answer is not an integer, write your answer in fraction form (use / to separate your
numerator and denominator), e.g. 1/2.
Answer:
P(D = 0|A = 1, B = 1)
= P(D = 0)P(A = 1|D = 0)P(B = 1|D = 0) / [Σ_{j=0}^{1} P(D = j)P(A = 1|D = j)P(B = 1|D = j)]
= (4/8)(2/4)(2/4) / [(4/8)(2/4)(2/4) + (4/8)(2/4)(1/4)]
= 2/3
Marking scheme:
 1.75 points for giving the correct answer.
(iii) According to the general Bayes classifier (i.e. without independence assumption),
what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your
answer in fraction form (use / to separate your numerator and denominator), e.g.
1/2.
Answer:
P(D = 1|A = 1, B = 1, C = 0) = 0 as there is no data with A = 1, B = 1, C
= 0 and D = 1 in the dataset.
Marking scheme:
 1.75 points for giving the correct answer.
(iv) According to the general Bayes classifier (i.e. without independence assumption),
what is P(D = 1|A = 1, B = 1)? If your answer is not an integer, write your answer
in fraction form (use / to separate your numerator and denominator), e.g. 1/2.
Answer:
P(D = 1|A = 1, B = 1) = 1/2 as number of data with A = 1, B = 1, D = 1
is 1, and number of data with A = 1, B = 1 is 2.
Marking scheme:
 1.75 points for giving the correct answer.
(c)
[2 points] The Naive Bayes algorithm selects the class c for an example x that maxi-
mizes P(c|x). Suppose one of your classmates stated that it is equivalent to selecting the
c that maximizes P(x|c) under an assumption. What is the assumption that he has made?
Answer:
P(c|x) = P(x|c)P(c) / P(x), so finding the c that maximizes P(c|x) is equivalent to finding c
that maximizes P(x|c), if the prior P(c) is uniform.
Marking scheme:
 2 points for stating the assumption correctly.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors
(a) [3 points] Consider a set of 5 training data given as ((xTrain_1, xTrain_2), cTrain) values,
where xTrain_1 and xTrain_2 are the two attribute values (positive integers) and cTrain is the
binary class label, A or B:
{ ((6,7),A), ((4,8),B), ((6,5),B), ((7,9),A), ((2,4),A) }
Classify a test example (xTest_1, xTest_2) with attribute values (4,7) using a KNN classifier
with K = 3 and Manhattan distance defined by
distance(xTrain, xTest) = Σ_{i=1}^{2} |xTrain_i − xTest_i|
where | · | denote absolute value.
Complete the following table by filling in the computed Manhattan distance between
each training data point and the test example. Determine the class label based on the
results.
xTrain_1 | xTrain_2 | cTrain | Distance
6 | 7 | A |
4 | 8 | B |
6 | 5 | B |
7 | 9 | A |
2 | 4 | A |
(b) [6 points] Judge whether each of the following student’s claims is correct or not. Explain
why.
(i)
[3 points] A student claims that the results of a general KNN classifier that uses
Euclidean distance will change if we multiply all attribute values of each training
and test data point by 0.5.
(ii) [3 points] Another student claims that the classification accuracy of the training set
will always increase if the value of K used in KNN classifier is incrementally increased
from 1 to N, where N is the total number of training examples.
(c) [5 points] Consider KNN using Euclidean distance on the following data set. Each point
belongs to one of the two classes: + and x.
(i)
[2 points] Perform 10-fold cross validation on the given data set (i.e. the 10 data
points as shown in the figure), what is the validation error when using 1-nearest
neighbor?
(ii) [3 points] Which of the following values of K leads to the minimum 10-fold cross vali-
dation error: 3, 5 or 9? What is the error for that K? If there is a tie, please elaborate.', 14, 9, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors
(a) [3 points] Consider a set of 5 training data given as ((xTrain_1, xTrain_2), cTrain) values,
where xTrain_1 and xTrain_2 are the two attribute values (positive integers) and cTrain is the
binary class label, A or B:
{ ((6,7),A), ((4,8),B), ((6,5),B), ((7,9),A), ((2,4),A) }
Classify a test example (xTest_1, xTest_2) with attribute values (4,7) using a KNN classifier
with K = 3 and Manhattan distance defined by
distance(xTrain, xTest) = Σ_{i=1}^{2} |xTrain_i − xTest_i|
where | · | denote absolute value.
Complete the following table by filling in the computed Manhattan distance between
each training data point and the test example. Determine the class label based on the
results.
xTrain_1 | xTrain_2 | cTrain | Distance
6 | 7 | A |
4 | 8 | B |
6 | 5 | B |
7 | 9 | A |
2 | 4 | A |
Answer:
xTrain_1 | xTrain_2 | cTrain | Distance
6 | 7 | A | 2
4 | 8 | B | 1
6 | 5 | B | 4
7 | 9 | A | 5
2 | 4 | A | 5
The predicted class label of the test example is B.
Marking scheme:
 0.5 point for giving each correct distance value. 2.5 points in total.
 0.5 point for giving the correct class label.
(b) [6 points] Judge whether each of the following student’s claims is correct or not. Explain
why.
(i)
[3 points] A student claims that the results of a general KNN classifier that uses
Euclidean distance will change if we multiply all attribute values of each training
and test data point by 0.5.
Answer:
The claim is false, because K nearest neighbors will remain unchanged after multi-
plying all attribute values of each training and test data points by 0.5.
Marking scheme:
 1 point for stating the claim is false.
 2 points for giving the correct explanation.
Remark:
 1 point given if stating the claim is correct but did mention that the change on
distance will be a multiplication of constant to the old distance AND didn’t state
result of KNN classifier will change.
 2 points given if stating the claim is correct but mention the change on distance
will be a multiplication and state the result wont change.
 0 point if correct deduction but draw the conclusion that result will change.
 1 point for explanation if treat the question as asking multiply the values on
Manhattan and answered correctly.
 0 point for explanation if treat the question as asking comparison between Man-
hattan and euclidean distance.
(ii) [3 points] Another student claims that the classification accuracy of the training set
will always increase if the value of K used in KNN classifier is incrementally increased
from 1 to N, where N is the total number of training examples.
Answer:
The claim is false. A counterexample is as follows:
The training set accuracy when K = 1 will be 100%.
As K approaches the total number of training examples more and more examples
influence the class, and eventually the class will always be the majority class in the
training set.
Marking scheme:
 1 point for stating the claim is false.
 2 points for giving the correct explanation.
Remarks:
 Give full mark if they misregard outliers as further neighbors.
 -1 point if they mention is due to outliers but didn’t explain what is regarded as
outliers.
 -2 points if they really mean outliers.
 -1 point if they only state larger k will cover more neighbors (this is just explain-
ing what does a larger k means but not explaining why larger k can lower the
accuracy) (*further neighbors is accepted but more neighbors is not)
 Other accept answers:
i. Larger k include further neighbors which have low relevancy.
ii. Underfitting
iii. Giving a concrete situation that the claim is wrong.
(c) [5 points] Consider KNN using Euclidean distance on the following data set. Each point
belongs to one of the two classes: + and x.
(i)
[2 points] Perform 10-fold cross validation on the given data set (i.e. the 10 data
points as shown in the figure), what is the validation error when using 1-nearest
neighbor?
Answer:
Every point is misclassified. So, the validation error is 10/10.
Marking scheme:
 2 points for giving the correct validation error.
Note: Both 10/10 and 100% are accepted as the correct answers.
(ii) [3 points] Which of the following values of K leads to the minimum 10-fold cross vali-
dation error: 3, 5 or 9? What is the error for that K? If there is a tie, please elaborate.
Answer:
All 3 values of K mis-classify all of the points and have the same classification errors,
10/10.
Marking scheme:
 2 points for stating all 3 values of K have the same classification errors.
 1 point for giving the correct error.
Note: Both 10/10 and 100% are accepted as the correct answers.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [12 points] K-Means Clustering
Given a 1-dimensional data set {0, 3, 6, 9, 27, 30}, use the K-means clustering algorithm and
Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3
and c2 = 4 are chosen as the initial cluster centers.
(a) [4.5 points] Perform one iteration of K-means clustering by finding the Euclidean distance
between each data point in the data set and the centroids, and assign each data point to
the closest centroid according to the distance found. Fill in the following table with your
computed values. If your distance value is not an integer, write your answer in decimal
form, e.g. 1.234.
Data Point
Distance between the
data point and c1 = 3
Distance between the
data point and c2 = 4
Closest Centroid
(c1 or c2?)
(b)
[1.5 points] What are the values of c1 and c2 after one iteration of K-means? If your
answer is not an integer, write your answer in decimal form, e.g. 1.234.
(c) [4.5 points] Perform the second iteration of K-means clustering by finding the Euclidean
distance between each data point in the data set and the computed centroids in part (b),
and assign each data point to the closest centroid according to the distance found. Fill in
the following table with your computed values. If your distance value is not an integer,
write your answer in decimal form, e.g. 1.234.
Data Point
Distance between the
data point and the c1
computed in part (b)
Distance between the
data point and the c2
computed in part (b)
Closest Centroid
(c1 or c2?)
(d)
[1.5 points] What are the values of c1 and c2 after the second iteration of K-means? If
your answer is not an integer, write your answer in decimal form, e.g. 1.234.', 12, 11, NULL::jsonb, NULL, NULL, 'Problem 5 [12 points] K-Means Clustering
Given a 1-dimensional data set {0, 3, 6, 9, 27, 30}, use the K-means clustering algorithm and
Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3
and c2 = 4 are chosen as the initial cluster centers.
(a) [4.5 points] Perform one iteration of K-means clustering by finding the Euclidean distance
between each data point in the data set and the centroids, and assign each data point to
the closest centroid according to the distance found. Fill in the following table with your
computed values. If your answer is not an integer, write your answer in decimal form,
e.g. 1.234.
Data Point
Distance between the
data point and c1 = 3
Distance between the
data point and c2 = 4
Closest Centroid
(c1 or c2?)
Answer:
Data Point
Distance between the
data point and c1 = 3
Distance between the
data point and c2 = 4
Closest Centroid
(c1 or c2?)
c1
c1
c2
c2
c2
c2
Marking scheme:
 0.25 for giving each correct value. 4.5 points in total.
(b)
[1.5 points] What are the values of c1 and c2 after one iteration of K-means? If your
answer is not an integer, write your answer in decimal form, e.g. 1.234.
Answer:
$c_1 = \frac{0 + 3}{2} = 1.5$
$c_2 = \frac{6 + 9 + 27 + 30}{4} = 18$
Marking scheme:
 0.75 for giving each correct centroid. 1.5 points in total.
(c) [4.5 points] Perform the second iteration of K-means clustering by finding the Euclidean
distance between each data point in the data set and the computed centroids in part (b),
and assign each data point to the closest centroid according to the distance found. Fill
in the following table with your computed values. If your answer is not an integer, write
your answer in decimal form, e.g. 1.234.
Data Point
Distance between the
data point and the c1
computed in part (b)
Distance between the
data point and the c2
computed in part (b)
Closest Centroid
(c1 or c2?)
Answer:
Data Point
Distance between the
data point and the c1
computed in part (b)
Distance between the
data point and the c2
computed in part (b)
Closest Centroid
(c1 or c2?)
1.5
c1
1.5
c1
4.5
c1
7.5
c1
25.5
c2
28.5
c2
Marking scheme:
 0.25 for giving each correct value. 4.5 points in total.
(d)
[1.5 points] What are the values of c1 and c2 after the second iteration of K-means? If
your answer is not an integer, write your answer in decimal form, e.g. 1.234.
Answer:
$c_1 = \frac{0 + 3 + 6 + 9}{4} = 4.5$
$c_2 = \frac{27 + 30}{2} = 28.5$
Marking scheme:
 0.75 for giving each correct centroid. 1.5 points in total.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [20 points] Perceptron
Given the following training dataset:
x1
0.5
x2
0.5
T
-1
-1
-1
(a)
[8 points] Show the action of the perceptron algorithm for the above sequence of data
points by completing the following table. Assume η = 1 and we start with the following
initial weights and bias
w1 = 1
w2 = 1
θ = 0
and use the following activation function.
$f(z) = \begin{cases} 1 & \text{if } z \ge 0 \\ -1 & \text{otherwise} \end{cases}$
Updating rules:
∆wi = η(T −O)xi
∆θ = η(T −O)
wi = wi + ∆wi
θ = θ + ∆θ
where i ∈{1, 2}.
If your answer is not an integer, write your answer in decimal form, e.g. 1.234.
x1
x2
T
O
∆w1
w1
∆w2
w2
∆θ
θ
-
-
-
-
-
-
-
(b)
[2 points] According to the values in the table above, state whether the perceptron
algorithm is converged in 1 epoch. If not, explain why.
(c) [10 points] Write a Python program to verify your answers of Part (a). In your program,
you need to define the following variables.
 A 2D NumPy array, X, to store all the attribute values x1, x2, where the shape is
(8,2)
 A 1D NumPy array, T,to store the target values, where the shape is (8,)
 A 1D NumPy array, W, to store the weights, where the shape is (2,)
 A float bias value, b.
and perform the required computations. Also, print the following sequence of values (in
exact order) for each iteration:
x1 < s > x2 < s > T < s > O < s > ∆w1 < s > w1 < s > ∆w2 < s > w2 < s > ∆θ < s > θ < end >
where < s > refers to an empty space, and < end > refers to an end of line character.
The following shows a line of sample output.
0 0 0 0 0 0 0 0 0 0
Remark: You cannot use any other libraries other than NumPy in your program.', 20, 12, NULL::jsonb, NULL, NULL, 'Problem 6 [20 points] Perceptron
Given the following training dataset:
x1
0.5
x2
0.5
T
-1
-1
-1
(a)
[8 points] Show the action of the perceptron algorithm for the above sequence of data
points by completing the following table. Assume η = 1 and we start with the following
initial weights and bias
w1 = 1
w2 = 1
θ = 0
and use the following activation function.
$f(z) = \begin{cases} 1 & \text{if } z \ge 0 \\ -1 & \text{otherwise} \end{cases}$
Updating rules:
∆wi = η(T −O)xi
∆θ = η(T −O)
wi = wi + ∆wi
θ = θ + ∆θ
where i ∈{1, 2}.
If your answer is not an integer, write your answer in decimal form, e.g. 1.234.
x1
x2
T
O
∆w1
w1
∆w2
w2
∆θ
θ
-
-
-
-
-
-
-
Answer:
x1
x2
T
O
∆w1
w1
∆w2
w2
∆θ
θ
-
-
-
-
-
-
-
-1
-2
-2
-2
-1
-6
-5
-6
-5
-2
-4
-1
-2
0.5
0.5
-1
-1
-1
-2
-4
-4
-4
Marking scheme:
 0.1 point for giving each correct value. 8 points in total.
(b)
[2 points] According to the values in the table above, state whether the perceptron al-
gorithm is converged in 1 epoch. If not, explain why.
Answer:
The algorithm is not converged in 1 epoch.
Since there are changes of weights and
biases.
Marking scheme:
 1 point for stating the algorithm is not converged.
 1 point for giving a correct explanation.
(c) [10 points] Write a Python program to verify your answers of Part (a). In your program,
you need to define the following variables
 A 2D NumPy array, X, to store all the attribute values x1, x2, where the shape is
(8,2)
 A 1D NumPy array, T,to store the target values, where the shape is (8,)
 A 1D NumPy array, W, to store the weights, where the shape is (2,)
 A float bias value, b.
and perform the required computations. Also, print the following sequence of values (in
exact order) for each iteration:
x1 < s > x2 < s > T < s > O < s > ∆w1 < s > w1 < s > ∆w2 < s > w2 < s > ∆θ < s > θ < end >
where < s > refers to an empty space, and < end > refers to an end of line character.
The following shows a line of sample output.
0 0 0 0 0 0 0 0 0 0
Remark: You cannot use any other libraries other than NumPy in your program.
Answer:
import numpy as np
# 0.5 point
X = np.array([[10,10], [0,0], [8,4], [3,3], [4,8], [0.5,0.5], [4,3], [2,5]])
# 1 point
T = np.array([1,-1,1,-1,1,-1,1,1])
# 0.5 point
W = np.array([1,1], dtype=float)
# 1 point
b = 0.0
# 0.5 point
for i in range(X.shape[0]):
# 1 point
y = X[i].dot(W) + b
# 1 point
if y >= 0:
# 0.5 point
O = 1
# 0.5 point
else:
O = -1
# 0.5 point
DW = (T[i] - O) * X[i]
# 0.5 point
W += DW
# 0.5 point
Db = (T[i] - O)
# 0.5 point
b += Db
# 0.5 point
print(X[i][0], X[i][1], T[i], O, DW[0], W[0], DW[1], W[1], Db, b)
# 1 point
Remarks:
 -0.25 point for forgetting to specify W = np.array([1,1], dtype=float)
 Can also b = 0, Python will duck-type into float when needed.
 -0.25 point each for not using NumPy array. No overlap with W float penalty, since
Python list supports duck-typing.
 -0.25 point each for wrong array shape.
 -0.5 point for hard-coding for i in range(8):.
 -0.25 point for forgetting bias in the output calculation.
 -0.25 point for minor print formatting errors.
 -0.25 point for incorrect print when output == target.
 -0.25 point each for miscellaneous syntax errors.
 -1 flat point for defining as class/function(s) but not calling.
 -10 floor point for using SKlearn.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2022-spring-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [12 points] Perceptron and Multilayer Perceptron
(a)
[4 points] Can we represent the given boolean function with a single neuron as shown
below?
A
B
f(A,B)
If yes, show the possible weights, bias and activation function that can be used to com-
pute the output of all the data points correctly. If not, explain why not in 1-2 sentences.
(b) [6 points] Suppose we have a neural network as shown below with θ1 = 0, θ2 = 0, θ3 = 0,
and linear activation function (i.e. f(x) = Cx, where C is a constant).
Can any function that is represented by the above network be represented by a single
unit of artificial neural network in the following diagram? If so, detail the weights (w7
and w8), bias (θ), and the activation function f(x). Otherwise, explain why not.
(c)
[2 points] Given a multilayer perceptron with 3 layers (input layer, hidden layer and
output layer). The number of units in each of the these layers are 3, 4, 2. Assume each
input or neuron is fully-connected. Calculate the number of trainable parameters of this
network.
-------------------- END OF PAPER
--------------------', 12, 14, NULL::jsonb, NULL, NULL, 'Problem 7 [12 points] Perceptron and Multilayer Perceptron
(a)
[4 points] Can we represent the given boolean function with a single neuron as shown
below?
A
B
f(A,B)
If yes, show the possible weights, bias and activation function that can be used to com-
pute the output of all the data points correctly. If not, explain why not in 1-2 sentences.
Answer:
Yes, we can represent this function with a single neuron, since it is linearly separable.
One set of possible weights and bias is: w1 = 1, w2 = −1, θ = −0.5, and the activation
function is
f(x) =



x > 0
otherwise
Marking scheme:
 1 point for stating it is possible to represent the given boolean function with a single
neuron.
 1 point for showing the “standard” activation function.
 2 points for correct values of w1, w2, and θ, given the activation function is the
“standard” one.
 If the given activation function is not “standard”, but the parameters are suitable,
also get 2 points.
 0 point for stating not possible to represent.
(b) [6 points] Suppose we have a neural network as shown below with θ1 = 0, θ2 = 0, θ3 = 0,
and linear activation function (i.e. f(x) = Cx, where C is a constant).
Can any function that is represented by the above network be represented by a single
unit of artificial neural network in the following diagram? If so, detail the weights (w7
and w8), bias (θ), and the activation function f(x). Otherwise, explain why not.
Answer:
Yes, the network can be represented by a single unit of artificial neural network by
setting its weights, w7 = w1w5 + w3w6, w8 = w2w5 + w4w6, and the activation function
f(x) = C2.
Marking scheme:
 0.5 point for stating any function that is represented by the MLP can be represented
by a single unit of artificial neural network.
 1.5 points for showing each possible weight w7, w8. 3 points in total.
 1 point for showing a possible bias value, i.e. 0.
 1.5 for showing the activation function.
Remarks:
 If stating NO, -6 points.
But among those NO’s, if students put the expression
OUTPUT=C2w5(x1w1 + x2w2 + θ1) + C2w6(x1w3 + x2w4 + θ2) + Cθ3, give 1 point.
 For those stating YES, if missing some value, deduct the score of that value.
 If the order of C is wrong, e.g. one C in w7w8 and no C in activation, or C2 in w7w8
and one C in activation, -1 point.
 For other wrong values, -full mark for that values.
 If didn’t plug in given values and the expression is wrong, even if the expression may
evaluate to the same value as the answer, -full mark for that expression or -1 point
if it’s an order-of-C issue.
 If didn’t plug in given values, e.g. θs=0, the same C for all units (some use C1C2C3),
but otherwise correct, -0.5 for each of θ & C.
(c)
[2 points] Given a multilayer perceptron with 3 layers (input layer, hidden layer and
output layer). The number of units in each of the these layers are 3, 4, 2. Assume each
input or neuron is fully-connected. Calculate the number of trainable parameters of this
network.
Answer:
3 ∗4 + 4 + 4 ∗2 + 2 = 26.
Marking scheme:
 2 points for giving the correct answer.
-------------------- END OF PAPER
--------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-final-part-a', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1 point for each correct answer.
(a) One drawback of the K-means algorithm is that one needs to specify exactly how many
clusters the algorithm should find.
(b) Increasing the number of hidden layers always increases the model performance.
(c) When handling a binary classification task, both softmax and sigmoid functions can be
used as the activation function in the output layer.
(d) Validation accuracy must be lower than training accuracy.
(e) We cannot train/inference deep learning networks using CPU.
(f) Otsu’s thresholding method and affine transformations are point-based image operations.
(g) In the convolutional layer of a CNN, the number of weights depends on the depth of the
input volume and the number of biases is equal to the number of kernels.
(h) After training a neural network, you observe a large gap between the training accuracy
(100%) and the task accuracy (40%). Dropout is commonly used to reduce this gap.
(i) In a minimax-based 3×3 tic-tac-toe game, an AI player will definitely win because it
knows all possible moves of the game.
(j) The alpha-beta pruning algorithm is preferred to minimax because it computes the same
answer as minimax while usually doing so without examining as much of the game tree.
Question
Answer (T/F)
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)', 10, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1 point for each correct answer.
(a) One drawback of the K-means algorithm is that one needs to specify exactly how many
clusters the algorithm should find.
(b) Increasing the number of hidden layers always increases the model performance.
(c) When handling a binary classification task, both softmax and sigmoid functions can be
used as the activation function in the output layer.
(d) Validation accuracy must be lower than training accuracy.
(e) We cannot train/inference deep learning networks using CPU.
(f) Otsu’s thresholding method and affine transformations are point-based image operations.
(g) In the convolutional layer of a CNN, the number of weights depends on the depth of the
input volume and the number of biases is equal to the number of kernels.
(h) After training a neural network, you observe a large gap between the training accuracy
(100%) and the task accuracy (40%). Dropout is commonly used to reduce this gap.
(i) In a minimax-based 3×3 tic-tac-toe game, an AI player will definitely win because it
knows all possible moves of the game.
(j) The alpha-beta pruning algorithm is preferred to minimax because it computes the same
answer as minimax while usually doing so without examining as much of the game tree.
Answer:
Question
Answer (T/F)
(a)
T
(b)
F
(c)
T
(d)
F
(e)
F
(f)
F
(g)
T
(h)
T
(i)
F
(j)
T
Marking scheme:
 1 point for giving each correct answer. 10 points in total.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-final-part-a', '2', NULL, 2, 'long_question', 'long_answer', 'Problem 2 [14 points] Na¨ıve Bayes and K-Nearest Neighbors
Given the training data in the table below.
No.
CGPA
Interest in
Computing Subjects
Practice-oriented
Learner?
COMP 2211
Grade
Select COMP
as Major
≤3
High
No
B
No
≤3
High
No
A
No
> 3 AND ≤4
High
No
B
Yes
> 4
Medium
No
B
Yes
> 4
Low
Yes
B
Yes
> 4
Low
Yes
A
No
> 3 AND ≤4
Low
Yes
A
Yes
≤3
Medium
No
B
No
≤3
Low
Yes
B
Yes
> 4
Medium
Yes
B
Yes
≤3
Medium
Yes
A
Yes
> 3 AND ≤4
Medium
No
A
Yes
> 3 AND ≤4
High
Yes
B
Yes
> 4
Medium
No
A
No
(a)
[6.5 points] Given a new example with the following attribute values. Predict the value
of its “Select COMP as Major” using Na¨ıve Bayes classifier. Show all the steps.
 CGPA ≤3
 Interest in Computing Subjects = Medium
 Practice-oriented Learner = Yes
 COMP 2211 Grade = B
(b)
[7.5 points] Similar to the above, but this time, predict the value of its “Select COMP
as Major” using K-nearest neighbor for K = 5. Complete the following table and state
the prediction result based on the data in the completed table. For similarity measure,
use a simple match of attribute values:
$S(a_i, b_i) = \sum_{i=1}^{4} w_i \cdot \mathrm{distance}(a_i, b_i)$
where distance(ai, bi) is 0 if ai equals bi, and 1 otherwise. ai and bi are either CGPA,
interest in computing subjects, practice-oriented learner or COMP 2211 grade. Weights,
wi, are all 1 except for interest in computing subjects, it is 2.
No.
Class
Distance to New Example
No
No
Yes
Yes
Yes
No
Yes
No
Yes
Yes
Yes
Yes
Yes
No', 14, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [14 points] Na¨ıve Bayes and K-Nearest Neighbors
Given the training data in the table below.
No.
CGPA
Interest in
Computing Subjects
Practice-oriented
Learner?
COMP 2211
Grade
Select COMP
as Major
≤3
High
No
B
No
≤3
High
No
A
No
> 3 AND ≤4
High
No
B
Yes
> 4
Medium
No
B
Yes
> 4
Low
Yes
B
Yes
> 4
Low
Yes
A
No
> 3 AND ≤4
Low
Yes
A
Yes
≤3
Medium
No
B
No
≤3
Low
Yes
B
Yes
> 4
Medium
Yes
B
Yes
≤3
Medium
Yes
A
Yes
> 3 AND ≤4
Medium
No
A
Yes
> 3 AND ≤4
High
Yes
B
Yes
> 4
Medium
No
A
No
(a)
[6.5 points] Given a new example with the following attribute values. Predict the value
of its “Select COMP as Major” using Na¨ıve Bayes classifier. Show all the steps.
 CGPA ≤3
 Interest in Computing Subjects = Medium
 Practice-oriented Learner = Yes
 COMP 2211 Grade = B
Answer: Let
 E be CGPA ≤3, Interest in Computing Subjects = Medium, Practice-oriented
Learner = Yes, COMP 2211 Grade = B
 E1 be CGPA ≤3
 E2 be interest in computing subjects = medium
 E3 be practice-oriented learner = yes
 E4 be COMP 2211 grade = B
$P(Yes \mid E) = \frac{P(E_1 \mid Yes) P(E_2 \mid Yes) P(E_3 \mid Yes) P(E_4 \mid Yes) P(Yes)}{P(E)}$
P(Yes) = 9/14 = 0.643
P(E1|Yes) = 2/9 = 0.222
P(E2|Yes) = 4/9 = 0.444
P(E3|Yes) = 6/9 = 0.667
P(E4|Yes) = 6/9 = 0.667
$P(Yes \mid E) = \frac{(0.222)(0.444)(0.667)(0.667)(0.643)}{P(E)} = \frac{0.028}{P(E)}$
$P(No \mid E) = \frac{P(E_1 \mid No) P(E_2 \mid No) P(E_3 \mid No) P(E_4 \mid No) P(No)}{P(E)}$
P(No) = 5/14 = 0.357
P(E1|No) = 3/5 = 0.6
P(E2|No) = 2/5 = 0.4
P(E3|No) = 1/5 = 0.2
P(E4|No) = 2/5 = 0.4
$P(No \mid E) = \frac{(0.6)(0.4)(0.2)(0.4)(0.357)}{P(E)} = \frac{0.007}{P(E)}$
Hence, the Na¨ıve Bayes classifier predicts “Select COMP as Major” = Yes for the new
example.
Marking scheme:
 0.5 for giving each conditional and prior probability. 6 points in total.
 0.5 for giving the correct prediction.
(b)
[7.5 points] Similar to the above, but this time, predict the value of its “Select COMP
as Major” using K-nearest neighbor for K = 5. Complete the following table and state
the prediction result based on the data in the completed table. For similarity measure,
use a simple match of attribute values:
$S(a_i, b_i) = \sum_{i=1}^{4} w_i \cdot \mathrm{distance}(a_i, b_i)$
where distance(ai, bi) is 0 if ai equals bi, and 1 otherwise. ai and bi are either CGPA,
interest in computing subjects, practice-oriented learner or COMP 2211 grade. Weights,
wi, are all 1 except for interest in computing subjects, it is 2.
No.
Class
Distance to New Example
No
0 + 2 + 1 + 0 = 3
No
0 + 2 + 1 + 1 = 4
Yes
1 + 2 + 1 + 0 = 4
Yes
1 + 0 + 1 + 0 = 2
Yes
1 + 2 + 0 + 0 = 3
No
1 + 2 + 0 + 1 = 4
Yes
1 + 2 + 0 + 1 = 4
No
0 + 0 + 1 + 0 = 1
Yes
0 + 2 + 0 + 0 = 2
Yes
1 + 0 + 0 + 0 = 1
Yes
0 + 0 + 0 + 1 = 1
Yes
1 + 0 + 1 + 1 = 3
Yes
1 + 2 + 0 + 0 = 3
No
1 + 0 + 1 + 1 = 3
Among the 5 nearest neighbors, 4 are from class Yes, and 1 from class No. Hence, the
KNN classifier predicts “Select COMP as Major” = Yes for the new example.
Marking scheme:
 0.5 point for giving each correct answer. 7 points in total.
 0.5 point for giving the correct prediction.', ARRAY['Probabilistic Models', 'KNN and Clustering']::TEXT[], 'Probabilistic Models', NULL, ARRAY['Probabilistic Models', 'KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-final-part-a', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [14 points] Multilayer Perceptron (MLP)
This problem is about multilayer perceptron (MLP). Answer all the questions below.
(a) [3 points] State when using the F1 metric is better than using accuracy as an evaluation
metric. Also use a confusion matrix to illustrate the stated situation.
(b)
[2 points] Suppose we are dealing with binary classification tasks using MLP. Explain
why it is inappropriate to use ReLU as the activation function in the output layer.
(c)
[2 points] Design the output layer of an MLP for handling a multilabel classification
problem with n classes by stating the number of neurons and the activation function.
Remark: Multilabel classification is a supervised learning problem where an instance
may be associated with multiple labels.
(d)
[3 points] Explain why it is not good to initialize all weights of an MLP to zero.
Hint: Refer to the following updating rules of weights and biases for MLP.
 δk = (Ok −Tk)Ok(1 −Ok)
 $\delta_j = O_j(1 - O_j) \sum_{k \in K} \delta_k w_{jk}$
 wjk ←wjk −ηδkOj
 wij ←wij −ηδjOi
 θj ←θj −ηδj
 θk ←θk −ηδk
(e)
[2 points] Explain what will happen if the learning rate η of an MLP is
(i) Too large
(ii) Too small
(f)
[2 points] Describe a way to avoid overfitting in MLP. Explain why it works.', 14, 6, NULL::jsonb, NULL, NULL, 'Problem 3 [14 points] Multilayer Perceptron (MLP)
This problem is about multilayer perceptron (MLP). Answer all the questions below.
(a) [3 points] State when using the F1 metric is better than using accuracy as an evaluation
metric. Also use a confusion matrix to illustrate the stated situation.
Answer:
 When the dataset is unbalanced.
 When false-negative and false-positive matter a lot.
Actual/Predicted
Infectious disease=yes
Infectious disease=no
Infectious disease=yes
Infectious disease=no
Accuracy = 0.8016
F1 score = 0.0741
Marking scheme:
 1 point for stating the situation
 2 points for the confusion matrix
(b)
[2 points] Suppose we are dealing with binary classification tasks using MLP. Explain
why it is inappropriate to use ReLU as the activation function in the output layer.
Answer:
We cannot determine the cut-off threshold to distinguish between the output classes when
there is an unbounded output range.
Marking scheme:
 2 points for explaining by the “unbounded” range of ReLU so cannot determine the
cut-off
 1 point if mentioning the range of function (*not describing what ReLU do, but
stating the range) but no further explanation
 0 point if only stating ReLU can output value more than 0 & 1 without mentioning it
has an unbounded range (bounded function beyond the range from 0 to 1 can work,
just do the mapping)
(c)
[2 points] Design the output layer of an MLP for handling a multilabel classification
problem with n classes by stating the number of neurons and the activation function.
Remark: Multilabel classification is a supervised learning problem where an instance
may be associated with multiple labels.
Answer:
n neurons and sigmoid function.
Marking scheme:
 1 point for n neurons
 1 point for sigmoid
(d)
[3 points] Explain why it is not good to initialize all weights of an MLP to zero.
Hint: Refer to the following updating rules of weights and biases for MLP.
 δk = (Ok −Tk)Ok(1 −Ok)
 $\delta_j = O_j(1 - O_j) \sum_{k \in K} \delta_k w_{jk}$
 wjk ←wjk −ηδkOj
 wij ←wij −ηδjOi
 θj ←θj −ηδj
 θk ←θk −ηδk
Answer:
If a network is initialized with all zeros, all the neurons will propagate on the same
gradient, making different neurons learn the same features. Thus, this leads to poor
performance.
Marking scheme:
 3 points if “the same update/propagate/feature learned/symmetry problem”
 2 points if “zero updates” as it isn’t always true for all MLP design
 2 points if stating only gradient computation are the same for neurons
 1 point if only mentioning deltaj will become zero / stating gradient “may” not
update
 0 point for low efficiency / poor performance
(e)
[2 points] Explain what will happen if the learning rate η of an MLP is
(i) Too large
(ii) Too small
Answer:
(i)
 Cause the model to converge too quickly to a sub-optimal solution.
 Unstable training like oscillations.
Marking scheme:
 1 point for explaining what will happen if the learning rate is too large.
 not accept learn faster/overfit/low accuracy
(ii) Learning will be slow.
Marking scheme:
 1 point for explaining what will happen if the learning rate is too small.
 not accept underfit
(f)
[2 points] Describe a way to avoid overfitting in MLP. Explain why it works.
Answer:
Adding regulations helps to keep the weights small, such that it is less likely for the
model to have a large variance (i.e. be sensitive to noise and fluctuations in data).
Marking scheme:
 1 point for method
 1 point for explanation
 only valid explanation (but not stating the definition or recalling some rule of thumbs)
get the explanation point', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-final-part-a', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [15 points] Digital Image Processing
(a) Assume we apply the following kernel to an 8-bit grayscale image.
K =


−1
−2
−2


(i)
[2 points] Determine the maximum and minimum possible values that a pixel to
which the given kernel is applied can have. Do not perform any normalization.
(ii)
[2 points] Suggest a grey-level transformation function to ensure that any output of
this kernel will be within the legal range of a standard 8-bit grayscale image.
(b) Consider the following 2 × 2 image:
Apply the following image operations sequentially.
(i)
[3 points] Show the resulting image of size 4 × 4 after adding reflection padding.
(ii) [4 points] Apply a 3×3 averaging kernel to the resulting image of part (b)(i). Assume
the output image after image averaging is in the same shape as the input image by
doing zero padding. Round the number to integer if needed.
(iii) [4 points] Compute the optimal threshold after applying ONE ITERATION of Otsu’s
method on the resulting image of part (b)(ii). Assume the initial threshold is set to
the mean pixel intensity of the resulting image.', 15, 7, NULL::jsonb, NULL, NULL, 'Problem 4 [15 points] Digital Image Processing
(a) Assume we apply the following kernel to an 8-bit grayscale image.
K =


−1
−2
−2


(i)
[2 points] Determine the maximum and minimum possible values that a pixel to
which the given kernel is applied can have. Do not perform any normalization.
Answer:
Since an 8-bit grayscale image is assumed, the highest value that each pixel can
have is 255, and the lowest value is 0.
After applying the kernel, the maximum
value is achieved when the negative values of the kernel multiply 0s and the pos-
itive values of the kernel multiply 255s. Then, the maximum achievable value is
vmax = (2 + 2 + 1)(255) = 1275. Following the same reasoning, the minimum value
will be vmin = (−2 −2 −1)(255) = −1275.
Marking scheme:
 1 point for stating the maximum possible value.
 1 point for stating the minimum possible value.
(ii)
[2 points] Suggest a grey-level transformation function to ensure that any output of
this kernel will be within the legal range of a standard 8-bit grayscale image.
Answer:
A 8-bit image must have a range from 0 to 255. Since the maximum and minimum
possible values for the gray levels are vmax = 1275 and vmin = −1275, respectively,
the function will be
Ioutput = 255
Iinput −vmin
vmax −vmin

= 255
Iinput −(−1275)
1275 −(−1275)

= 255
Iinput + 1275

where Iinput and Ioutput are the input and output images, respectively.
Marking scheme:
 2 points for stating a transformation function.
 -1 point if the function is merely returning legal range.
(b) Consider the following 2 × 2 image:
Apply the following image operations sequentially.
(i)
[3 points] Show the resulting image of size 4 × 4 after adding reflection padding.
Answer:
Marking scheme:
 0.25 point for each correct value. 3 points in total.
(ii)
[4 points] Apply a 3 × 3 averaging kernel to the resulting image of part (b)(i). As-
sume the output image after image averaging is in the same shape as the input image
by doing zero padding. Round the number to integer if needed.
Answer:
Marking scheme:
 0.25 point for each correct value. 4 points in total.
(iii) [4 points] Compute the optimal threshold after applying ONE ITERATION of Otsu’s
method on the resulting image of part (b)(ii). Assume the initial threshold is set to
the mean pixel intensity of the resulting image.
Answer:
 Initial threshold = (0 + 2 + 4 + 4 + 4 + 9 + 12 + 10 + 8 + 15 + 18 + 14 + 8 + 14 +
16 + 12)/16 = 9.375
 µ1 = (0 + 2 + 4 + 4 + 4 + 9 + 8 + 8)/8 = 4.875
 µ2 = (12 + 10 + 15 + 18 + 14 + 14 + 16 + 12)/8 = 13.875
 Optimal threshold = (4.875 + 13.875) = 9.375
Marking scheme:
 1 point for the answer of initial threshold.
 1 point for the answer of µ1.
 1 point for the answer of µ2.
 1 point for the optimal threshold after 1 iteration.
 -1 point for incorrect input from part b(ii). I.e. incorrect calculation based on
result of part b(ii.)', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-final-part-b', '1', NULL, 1, 'long_question', 'long_answer', 'Problem 1 [19 points] Convolutional Neural Network (CNN)
(a)
[1.5 points] Suppose there is a convolutional layer in a CNN with the following parame-
ters.
 Kernel size: 3 × 3
 Zero padding: 1-pixel border on each side
 Stride: 2-pixel in each direction
Compute the output feature map of the convolutional layer with the following input im-
age and kernel. Assume the number of channels for both input and output images is 1.
Express your answer as a 2D nested list. E.g. [[2,2],[2,2]].
Input image:
Kernel:
(b)
[1.5 points] Give an example of a data augmentation technique that would be useful for
classifying images of cats and dogs, but not for classifying handwritten digits. Briefly
explain your answer.
(c) [4.5 points] Assume we have a CNN with the architecture as described in Table 1. Com-
plete the table by filling in the input and output shape of each layer in the format: ‘height
x width x channel’ (Note: A space is placed before and after the symbol ’x’). To illustrate
that, the input shape of the 1st convolutional layer has been inserted for you.
Formula:
Convolution output size
= [ (size of image dimension - size of kernel dimension + 2 × padding) / stride ] + 1
Layer (Size, Specifications)
Input Shape
Output Shape
7x7 conv, 64 kernels, stride 2, padding 3, with biases
224 x 224 x 3
3x3 max pooling, stride 2, padding 1
3x3 conv, 128 kernels, stride 2, with biases
3x3 conv, 256 kernels, stride 2, padding 1, with biases
3x3 conv, 512 kernels, stride 2, padding 1, with biases
Table 1: Architecture of a CNN
(d)
[2 points] Assume we want to add a fully-connected layer at the back of the CNN as
described in part (b). However, the output of the resulting network still contains a large
number of parameters. State what we should do if we want to reduce the number of
parameters and summarize the output.
Hint: We want to turn N × N × 512 output to 1 × 1 × 512.
Remark: Convolution layer is not an acceptable answer.
(e)
[2.5 points] Calculate the total number of parameters of the model? (i.e. for all the
layers described in Table 1).
(f) Assume the model should classify images into 1000 distinct classes by
(i) [2.5 points] Appending the network shown in Table 2 to the end of the CNN network
(i.e. the one described in Table 1) appended with the structure that you added in
part (c).
Layer (Size, Specifications)
Input Shape
Output Shape
Flatten
1 x 1 x 512
Fully-connected (Flatten before feed-in)
Table 2: Classification network
Calculate the total number of parameters in the whole network (i.e.
the layers
described in Table 1, your answer in part (c), and the layers described in Table 2).
(ii) [2.5 points] Now, suppose we use the MLP described in Table 3 to classify the images
instead of CNN.
Layer (Size, Specifications)
Input Shape
Output Shape
Flatten
224 x 224 x 3
Fully-connected (Flatten before feed-in)
Table 3: Classification using MLP
Calculate the total number of parameters in the MLP (i.e. only those described in
Table 3).
(g)
[2 points] State what you observe in part (e). Also, state the property of CNN, which
leads to this difference.', 19, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [19 points] Convolutional Neural Network (CNN)
(a)
[1.5 points] Suppose there is a convolutional layer in a CNN with the following parame-
ters.
 Kernel size: 3 × 3
 Zero padding: 1-pixel border on each side
 Stride: 2-pixel in each direction
Compute the output feature map of the convolutional layer with the following input im-
age and kernel. Assume the number of channels for both input and output images is 1.
Express your answer as a 2D nested list. E.g. [[2,2],[2,2]]. If your calculated answer
is a floating-point number, convert it to an integer using the floor function.
Input image:
Kernel:
Answer:
[[73]] for using the unflipped kernel OR
[[27]] for using the flipped kernel.
Marking scheme:
 Accepted answers:
[[73 (39, 43, 69)]], [[27 (61, 31, 57)]] (for flipped kernel)
-1 point if the format is wrong. e.g., missing bracket(s).
If there are more than 1 value, the correct answer will be instead a 2x2 array ex-
tracted from the following arrays. There are two possible solutions:
[A[1, 1], A[1, 2]], [A[2, 1], A[2, 2]]
[A[1, 1], A[1, 3]], [A[3, 1], A[3, 3]] (stride 2 with extended zero padding)
A = [[ 2, 16, 38, 28],
[ 5, 27, 61, 53],
[ 8, 31, 57, 60],
[ 3, 13, 21, 27]]
A = [[18, 44, 22, 12],
[25, 73, 39, 17],
[22, 69, 43, 10],
[ 7, 27, 19,
3]]
(this two array is generated by
scipy.signal.convolve2d(in1, in2, mode=''full'', boundary=''fill'', fillvalue=0))
There are two possible arrays because traditionally, the former one is the “real” con-
volution. I.e. the kernel or image is flipped along both axes before the convolving op-
erations. However, at the latter one (the originally intended answer), it’s calculated
WITHOUT the flipping. This operation is called the cross-correlation operation.
(b)
[1.5 points] Give an example of a data augmentation technique that would be useful for
classifying images of cats and dogs, but not for classifying handwritten digits. Briefly
explain your answer.
Answer:
Flipping the image horizontally. Doing this to a dog/cat image would be reasonable,
but not so for an image of a handwritten digit.
Marking scheme:
 2 points for “rotate and reflections” as augmentation methods.
 0 point for image processing methods.
(c) [4.5 points] Assume we have a CNN with the architecture as described in Table 1. Com-
plete the table by filling in the input and output shape of each layer in the format: ‘height
x width x channel’ (Note: A space is placed before and after the symbol ’x’). To illustrate
that, the input shape of the 1st convolutional layer has been inserted for you.
Formula:
Convolution output size
= [ (size of image dimension - size of kernel dimension + 2 × padding) / stride ] + 1
If your calculated answer is a floating-point number, convert it to an integer using the
floor function.
Layer (Size, Specifications)
Input Shape
Output Shape
7x7 conv, 64 kernels, stride 2, padding 3, with biases
224 x 224 x 3
112 x 112 x 64
3x3 max pooling, stride 2, padding 1
112 x 112 x 64
56 x 56 x 64
3x3 conv, 128 kernels, stride 2, with biases
56 x 56 x 64
27 x 27 x 128
3x3 conv, 256 kernels, stride 2, padding 1, with biases
27 x 27 x 128
13 x 13 x 256
3x3 conv, 512 kernels, stride 2, padding 1, with biases
13 x 13 x 256
7 x 7 x 512
Table 1: Architecture of a CNN
Marking scheme:
 0.25 point for number of channels
 0.25 point for height and width
 -1 point if the format is wrong, e.g. (c, h, w) instead of (h, w, c)
(d)
[2 points] Assume we want to add a fully-connected layer at the back of the CNN as
described in part (b). However, the output of the resulting network still contains a large
number of parameters. State what we should do if we want to reduce the number of
parameters and summarize the output.
Hint: We want to turn N × N × 512 output to 1 × 1 × 512.
Remark: Convolution layer is not an acceptable answer.
Answer:
Max pooling or average pooling.
Marking scheme:
 2 points for giving the correct answer.
 -0.5 point if type of pooling wasn’t specified.
 -0.5 point if pooling is not mentioned. (e.g. taking average) As pooling is a standard
component of neural networks.
 1 point for resize.
(e)
[2.5 points] Calculate the total number of parameters of the model? (i.e. for all the
layers described in Table 1).
Answer:
The total number of parameters =(7 × 7 × 3 × 64 + 64)+
(3 × 3 × 64 × 128 + 128)+
(3 × 3 × 128 × 256 + 256)+
(3 × 3 × 256 × 512 + 512)
=9472 + 73856 + 295168 + 1180160
=1558656
Marking scheme:
 2.5 point for giving the correct answer.
(f) Assume the model should classify images into 1000 distinct classes by
(i) [2.5 points] Appending the network shown in Table 2 to the end of the CNN network
(i.e. the one described in Table 1) appended with the structure that you added in
part (c).
Layer (Size, Specifications)
Input Shape
Output Shape
Flatten
1 x 1 x 512
Fully-connected (Flatten before feed-in)
Table 2: Classification network
Calculate the total number of parameters in the whole network (i.e. the layers de-
scribed in Table 1, your answer in part (d), and the layers described in Table 2).
Answer:
The total number of parameters =1558656 + (512 × 1000 + 1000)
=1558656 + 513000
=2071656
Marking scheme:
 2.5 point for giving the correct answer.
(ii) [2.5 points] Now, suppose we use the MLP described in Table 3 to classify the images
instead of CNN.
Layer (Size, Specifications)
Input Shape
Output Shape
Flatten
224 x 224 x 3
Fully-connected (Flatten before feed-in)
Table 3: Classification using MLP
Calculate the total number of parameters in the MLP (i.e. only those described in
Table 3).
Answer:
The total number of parameters =(150528 × 1000 + 1000)
=150529000
Marking scheme:
 2.5 point for giving the correct answer.
(g)
[2 points] State what you observe in part (f). Also, state the property of CNN, which
leads to this difference.
Answer:
The number of parameters of CNN is significantly less than the number of parameters of
pure MLP. This is because CNN uses shared parameters (kernels) to process the input
instead of using 1 parameter for each individual input.
Marking scheme:
 1 point for observations. If the observation is wrong, explanation is ignored.
 1 point for explanation if mentioned “large scale extraction” but “feature extraction”
only gains 0.5 point.
 1 point for “sparse connection”
 0 point for reducing output size.
As fully-connected layer could also reduce the
output size for the following fc layer, and yet the number of parameter is even more
bloated.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2022-spring-final-part-b', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [13 points] Python Programming: Convolutional Neural Network
Suppose the following CNN model learns human emotions from RGB images of faces and
classifies into 12 emotion categories.
Assumptions: All layers have no padding. Both two convolutional layers have 3 × 3 kernels.
The model is trained with Adam optimizer in default learning rate, and the loss function is
categorical cross-entropy. You don’t need to specify extra metrics like accuracy.
(a)
[9 points] According to all the information given above, write the Python codes to con-
struct and compile the model using Keras library. The following import statements are
provided for you. Also, a reference of useful Keras classes and functions are given in the
appendix.
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Flatten
(b)
[4 points] Now instead of classification, a student wants to predict a single continuous
real value from an input picture: how positive (or negative) the emotion is, ranging from
0 to 1. Is CNN in general able to do that? If yes, derive a model to complete such a task
from part (a) model. Point out which parts you will change when building and compiling
the model (if any) and explain how you will change it. (You don’t have to write codes,
but state clearly your ideas). If no, also explain why.
Appendix:
Below are some Keras documentation for your reference. Some irrelevant parameters are
omitted for conciseness.
Sequential class
tf.keras.Sequential(layers=None, name=None)
Sequential groups a linear stack of layers into a tf.keras.Model
 add Method
Sequential.add(layer)
– layer: layer instance
 compile Method
Model.compile(optimizer="rmsprop", loss=None)
– optimizer: String (name of optimizer) or optimizer instance.
– loss: Loss function.
Conv2D class
tf.keras.layers.Conv2D(
filters, kernel_size, strides=(1, 1), padding="valid", activation=None,
)
2D convolution layer (e.g. spatial convolution over images). When using this layer as the
first layer in a model, provide the keyword argument input shape (tuple of integers, does
not include the sample axis), e.g. input shape=(128,128,3) for 128x128 RGB pictures in
data format="channels last".
 filters: Integer, the dimensionality of the output space (i.e. the number of output filters
in the convolution).
 kernel size: An integer or tuple/list of 2 integers, specifying the height and width of the
2D convolution window. Can be a single integer to specify the same value for all spatial
dimensions.
 strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution
along the height and width. Can be a single integer to specify the same value for all
spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any
dilation rate value != 1.
 padding: one of “valid” or “same” (case-insensitive).
“valid” means no padding.
“same” results in padding with zeros evenly to the left/right or up/down of the input.
When padding="same" and strides=1, the output has the same size as the input.
 activation: Activation function to use. If you don’t specify anything, no activation is
applied.
MaxPooling2D class
tf.keras.layers.MaxPooling2D(
pool_size=(2, 2), strides=None, padding="valid",
)
Max pooling operation for 2D spatial data.
 pool size: integer or tuple of 2 integers, window size over which to take the maximum.
(2, 2) will take the max value over a 2x2 pooling window. If only one integer is specified,
the same window length will be used for both dimensions.
 strides: Integer, tuple of 2 integers, or None.
Strides values.
Specifies how far the
pooling window moves for each pooling step. If None, it will default to pool size.
 padding: One of “valid” or “same” (case-insensitive).
“valid” means no padding.
“same” results in padding evenly to the left/right or up/down of the input such that
output has the same height/width dimension as the input.
Flatten class
tf.keras.layers.Flatten()
Flattens the input.
Dense class
tf.keras.layers.Dense(
units, activation=None,
)
Regular densely-connected NN layer.
 units: Positive integer, dimensionality of the output space.
 activation: Activation function to use. If you don’t specify anything, no activation is
applied.
Common activation functions (in shorthand strings): “relu”, “sigmoid”, “softmax”
Common loss functions (in shorthand strings):
“categorical crossentropy”,
“sparse categorical crossentropy”,
“mean squared error” (same as “MSE”),
“mean absolute error” (same as “MAE”)
Common optimizer (in shorthand strings): “adam”', 13, 5, NULL::jsonb, NULL, NULL, 'Problem 2 [13 points] Python Programming: Convolutional Neural Network
Suppose the following CNN model learns human emotions from RGB images of faces and
classifies into 12 emotion categories.
Assumptions: All layers have no padding. Both two convolutional layers have 3 × 3 kernels.
The model is trained with Adam optimizer in default learning rate, and the loss function is
categorical cross-entropy. You don’t need to specify extra metrics like accuracy.
(a)
[9 points] According to all the information given above, write the Python codes to con-
struct and compile the model using Keras library. The following import statements are
provided for you. Also, a reference of useful Keras classes and functions are given in the
appendix.
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Flatten
Answer:
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation=''relu'',
strides=(3,3), input_shape=(32, 32, 1)))
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation=''relu'', stride=(2,2)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(units=12, activation=''softmax''))
model.compile(optimizer=''adam'', loss=''categorical_crossentropy'')
Marking scheme:
 Apply the following rules until the score is 0.
– -1 for each of Pooling, Flatten, Dense, Compile lines if it is missing or has any
wrong parameter
– -1 for each of the wrong or missing parameters in a Conv2D layer.
This in-
cludes any parameter that is good by default but you break it by setting another
absolutely wrong value.
– -1 if the input shape is not specified in any manner
– -1 for syntax errors like messed up brackets, the layers are not actually added to
a model, etc. Small typos on keywords, etc. are not penalized
(b)
[4 points] Now instead of classification, a student wants to predict a single continuous
real value from an input picture: how positive (or negative) the emotion is, ranging from
0 to 1. Is CNN in general able to do that? If yes, derive a model to complete such a task
from part (a) model. Point out which parts you will change when building and compiling
the model (if any) and explain how you will change it. (You don’t have to write codes,
but state clearly your ideas). If no, also explain why.
Answer:
Yes, CNN is able to do that in general, and the following parts need to be changed.
 Change the Dense layer to units=1.
 Change the Dense layer activation to any other than softmax, e.g., relu or sigmoid.
 Change the loss to any regression loss, e.g., MSE, MAE.
Marking scheme:
 1 point for stating CNN is able to do that in general.
 1 point for changing the output Dense unit to 1
 1 point for changing output activation to anything not related to probability and
range containing [0,1] (sigmoid, linear, relu, etc.)
 1 point for changing the loss function to any real value comparison (subtraction),
e.g. Mean Squared Error, Mean Absolute Error, or other similar self-defined losses.
 -0.5 point for each change if the student doesn’t specify to what it changes or changes
it to a wrong value.
Appendix:
Below are some Keras documentation for your reference. Some irrelevant parameters are
omitted for conciseness.
Sequential class
tf.keras.Sequential(layers=None, name=None)
Sequential groups a linear stack of layers into a tf.keras.Model
 add Method
Sequential.add(layer)
– layer: layer instance
 compile Method
Model.compile(optimizer="rmsprop", loss=None)
– optimizer: String (name of optimizer) or optimizer instance.
– loss: Loss function.
Conv2D class
tf.keras.layers.Conv2D(
filters, kernel_size, strides=(1, 1), padding="valid", activation=None,
)
2D convolution layer (e.g. spatial convolution over images). When using this layer as the
first layer in a model, provide the keyword argument input shape (tuple of integers, does
not include the sample axis), e.g. input shape=(128,128,3) for 128x128 RGB pictures in
data format="channels last".
 filters: Integer, the dimensionality of the output space (i.e. the number of output filters
in the convolution).
 kernel size: An integer or tuple/list of 2 integers, specifying the height and width of the
2D convolution window. Can be a single integer to specify the same value for all spatial
dimensions.
 strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution
along the height and width. Can be a single integer to specify the same value for all
spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any
dilation rate value != 1.
 padding: one of “valid” or “same” (case-insensitive).
“valid” means no padding.
“same” results in padding with zeros evenly to the left/right or up/down of the input.
When padding="same" and strides=1, the output has the same size as the input.
 activation: Activation function to use. If you don’t specify anything, no activation is
applied.
MaxPooling2D class
tf.keras.layers.MaxPooling2D(
pool_size=(2, 2), strides=None, padding="valid",
)
Max pooling operation for 2D spatial data.
 pool size: integer or tuple of 2 integers, window size over which to take the maximum.
(2, 2) will take the max value over a 2x2 pooling window. If only one integer is specified,
the same window length will be used for both dimensions.
 strides: Integer, tuple of 2 integers, or None.
Strides values.
Specifies how far the
pooling window moves for each pooling step. If None, it will default to pool size.
 padding: One of “valid” or “same” (case-insensitive).
“valid” means no padding.
“same” results in padding evenly to the left/right or up/down of the input such that
output has the same height/width dimension as the input.
Flatten class
tf.keras.layers.Flatten()
Flattens the input.
Dense class
tf.keras.layers.Dense(
units, activation=None,
)
Regular densely-connected NN layer.
 units: Positive integer, dimensionality of the output space.
 activation: Activation function to use. If you don’t specify anything, no activation is
applied.
Common activation functions (in shorthand strings): “relu”, “sigmoid”, “softmax”
Common loss functions (in shorthand strings):
“categorical crossentropy”,
“sparse categorical crossentropy”,
“mean squared error” (same as “MSE”),
“mean absolute error” (same as “MAE”)
Common optimizer (in shorthand strings): “adam”', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2022-spring-final-part-b', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [8 points] Minimax and Alpha-Beta Pruning
Two players, MAX and MIN, are playing a game that can be represented by a tree, as shown
below.
(a)
[3.5 points] Complete the following table by estimating the minimax value of each non-
terminal node.
A
B
C
D
E
F
G
(b)
[0.5 point] State the proper move of the maximizer by writing down one of the root’s
outgoing edges (i.e. E-a or E-b).
Note: The root node of the given tree is A.
(c)
[1.5 points] State whether minimax-based AI will choose to make a move which will
result in a slower victory. Explain your answer.
(d)
[2.5 points] Suppose we now apply alpha-beta pruning on the game tree. Indicate the
edge(s) that would be pruned (eliminated from consideration) by writing down the edge
labels (i.e. E-a, E-b, E-c, . . ., E-n). You may assume that the branches are explored
from left to right.', 8, 9, NULL::jsonb, NULL, NULL, 'Problem 3 [8 points] Minimax and Alpha-Beta Pruning
Two players, MAX and MIN, are playing a game that can be represented by a tree, as shown
below.
(a)
[3.5 points] Complete the following table by estimating the minimax value of each non-
terminal node.
Answer:
A
B
C
-1
D
E
F
-1
G
Marking scheme:
 0.5 point for each correct answer. 3.5 points in total.
(b)
[0.5 point] State the proper move of the maximizer by writing down one of the root’s
outgoing edges (i.e. E-a or E-b).
Note: The root node of the given tree is A.
Answer:
E-a
Marking scheme:
 0.5 point for the correct answer.
(c)
[1.5 points] State whether minimax-based AI will choose to make a move which will
result in a slower victory. Explain your answer.
Answer:
Yes, minimax-based AI may choose to make a move, resulting in a slower victory. Since
we may have two moves with the same maximum minimax value, it picks the one in
slower victory.
Marking scheme:
 1 point for stating minimax-based AI will choose to make a move which will result
in a slower victory.
 0.5 point for giving the proper explanation.
(d)
[2.5 points] Suppose we now apply alpha-beta pruning on the game tree. Indicate the
edge(s) that would be pruned (eliminated from consideration) by writing down the edge
labels (i.e. E-a, E-b, E-c, . . ., E-n). You may assume that the branches are explored
from left to right.
Answer:
E-j and E-f would be pruned.
Marking scheme:
 If E-j appears: +1 point.
 If E-f appears: +1.5 point.
 If E-m or E-n appears: +0 point.
 If other label appears: then d) become 0 points, no matter E-j and E-f appears or
not.', ARRAY['Search and Games']::TEXT[], 'Search and Games', 'Search and Games', ARRAY['Search and Games']::TEXT[], ARRAY['tree_search', 'pruning', 'manual_tracing']::TEXT[], 'easy', '', '', ''),
    ('COMP2211-2022-spring-final-part-b', '4', NULL, 4, 'long_question', 'short_answer', 'Problem 4 [7 points] Ethics of Artificial Intelligence
This question consists of five sub-questions, four of them are multiple-choice questions, and
one is a short question. Choose the BEST ANSWER among the given choices for each
multiple-choice question and put your answer in the given table, while for the short question,
answer it in a few sentences.
(a)
[1 point] Ethics in artificial intelligence is
(A) Something that is not an issue.
(B) Something that somebody else will do in the future.
(C) Something that we need to apply today.
(D) Something that is entirely solved in current AI systems.
(b)
[1 point] One approach that helps developers avoid unintentionally creating bias in AI
systems is
(A) Using a wide variety of appropriately diverse data for training.
(B) Using highly specific training data from a narrow range.
(C) Not using any training data.
(D) None of the above
(c)
[1 point] What are some of the ethical concerns around artificial intelligence?
I. Racial, gender or other types of bias.
II. Loss of jobs due to AI replacing workers performing repetitive tasks.
III. Concern about the trustworthiness of decision-making supported by AI systems.
IV. Privacy, for example, as human faces are photographed and recognized in public
spaces.
(A) I and II only
(B) I, II, and IV only
(C) All of the above
(D) None of the above
(d) [1 point] What is a significantly way in which developers of AI systems can guard against
introducing bias?
(A) Using only examples from their own environment as training data.
(B) Providing effective training data and performing regular tests and audits.
(C) Using less varied AI systems and datasets.
(D) Using government approved algorithms.
Question
Answer
(a)
(b)
(c)
(d)
(e)
[3 points] State THREE ethical issues involved with the introduction of autonomous
vehicles.', 7, 10, NULL::jsonb, NULL, NULL, 'Problem 4 [7 points] Ethics of Artificial Intelligence
This question consists of five sub-questions, four of them are multiple-choice questions, and
one is a short question. Choose the BEST ANSWER among the given choices for each
multiple-choice question and put your answer in the given table, while for the short question,
answer it in a few sentences.
(a)
[1 point] Ethics in artificial intelligence is
(A) Something that is not an issue.
(B) Something that somebody else will do in the future.
(C) Something that we need to apply today.
(D) Something that is entirely solved in current AI systems.
(b)
[1 point] One approach that helps developers avoid unintentionally creating bias in AI
systems is
(A) Using a wide variety of appropriately diverse data for training.
(B) Using highly specific training data from a narrow range.
(C) Not using any training data.
(D) None of the above
(c)
[1 point] What are some of the ethical concerns around artificial intelligence?
I. Racial, gender or other types of bias.
II. Loss of jobs due to AI replacing workers performing repetitive tasks.
III. Concern about the trustworthiness of decision-making supported by AI systems.
IV. Privacy, for example, as human faces are photographed and recognized in public
spaces.
(A) I and II only
(B) I, II, and IV only
(C) All of the above
(D) None of the above
(d) [1 point] What is a significantly way in which developers of AI systems can guard against
introducing bias?
(A) Using only examples from their own environment as training data.
(B) Providing effective training data and performing regular tests and audits.
(C) Using less varied AI systems and datasets.
(D) Using government approved algorithms.
Question
Answer
(a)
C
(b)
A
(c)
C
(d)
B
Marking scheme:
 1 point for each correct answer. 4 points in total.
(e)
[3 points] State THREE ethical issues involved with the introduction of autonomous
vehicles.
Answer:
 Who is to blame in an accident?
 In an emergency situation who should the car prioritize?
 Increase in use of cars is bad for the environment.
 Cost of the cars.
Marking scheme:
 1 point for giving each correct ethical issue. 3 points in total.', ARRAY['Ethics of AI']::TEXT[], 'Ethics of AI', 'Ethics of AI', ARRAY['Ethics of AI']::TEXT[], ARRAY['concept_explanation', 'argumentation', 'comparison']::TEXT[], 'easy', '', '', ''),
    ('COMP2211-2023-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions
Indicate whether the following statements are true or false by circling T or F. You get 1
point for each correct answer.
(a)
T
F
Na¨ıve Bayes classifier is a probabilistic algorithm that computes the probability of an
instance belonging to each class and selects the class with the highest probability as the
output.
(b)
T
F
Na¨ıve Bayes classifier can be used for multi-class classification task.
(c)
T
F
K-Nearest Neighbors is a supervised learning and parametric algorithm that can be used
to solve both classification and regression problems.
(d)
T
F
In K-Nearest Neighbors algorithm, the value of K should always be odd to avoid ties.
(e)
T
F
In D-fold cross validation, an increase of D will result in a longer time required to cross-
validate the result.
(f)
T
F
After centroids initialization, K-Means Clustering is sensitive to the order in which the
data points are processed, meaning that changing the order of the input data points may
lead to different clustering results.
(g)
T
F
K-Median Clustering is robust to the presence of outliers and noise in the dataset, as it
uses the median of the data points as the center of each cluster.
Note: The median is the middle number in a sorted, ascending or descending list of
numbers.
(h)
T
F
A perceptron with different initialization of weights and bias may result in different
decision boundaries.
(i)
T
F
For perceptron, larger learning rates always lead to faster convergence.
(j)
T
F
Multilayer Perceptron with more layers are more expressive than Single Layer Perceptron
regardless of the activation function is linear or not.', 10, 3, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions
Indicate whether the following statements are true or false by circling T or F. You get 1
point for each correct answer.
(a)
T
F
Na¨ıve Bayes classifier is a probabilistic algorithm that computes the probability of an
instance belonging to each class and selects the class with the highest probability as the
output.
(b)
T
F
Na¨ıve Bayes classifier can be used for multi-class classification task.
(c)
T
F
K-Nearest Neighbors is a supervised learning and parametric algorithm that can be used
to solve both classification and regression problems.
(d)
T
F
In K-Nearest Neighbors algorithm, the value of K should always be odd to avoid ties.
(e)
T
F
In D-fold cross validation, an increase of D will result in a longer time required to cross-
validate the result.
(f)
T
F
After centroids initialization, K-Means Clustering is sensitive to the order in which the
data points are processed, meaning that changing the order of the input data points may
lead to different clustering results.
(g)
T
F
K-Median Clustering is robust to the presence of outliers and noise in the dataset, as it
uses the median of the data points as the center of each cluster.
Note: The median is the middle number in a sorted, ascending or descending list of
numbers.
(h)
T
F
A perceptron with different initialization of weights and bias may result in different
decision boundaries.
(i)
T
F
For perceptron, larger learning rates always lead to faster convergence.
(j)
T
F
Multilayer Perceptron with more layers are more expressive than Single Layer Perceptron
regardless of the activation function is linear or not.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2023-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [21 points] Python Fundamentals
(a)
[9 points] Consider the following Numpy arrays:
import numpy as np
# np.arange(stop)
# return an array of evenly spaced values within the half-open interval [0,stop),
# the default step size is 1.
A = np.arange(10)
B = np.array([[5, 10, 15],
[20, 25, 30],
[35, 40, 45]])
Write the output for each of the following Python statements. If the output is an empty
array, write “Empty Array”. If an error occurs, write “Error”.
(i) print(A[1:5])
(ii) print(A[1:5:2])
(iii) print(A[5::-2])
(iv) print(B[:2,1:])
(v) print(B[A[:1]])
(vi) print(B[A[1:]])
(vii) A[A%3 == 0] = 100
print(A)
(viii) # np.mean(a, axis)
# return the average of the array elements over the specified axis.
print(np.mean(B, axis=-1))
(ix) # np.ndarray.transpose(*axis)
# return a view of the array with axes transposed.
print(B.transpose((1,0)))
(b)
[6 points] Write the output for the following Python code segments. If the output is an
empty array, write “Empty Array”. If an error occurs, write “Error”.
(i) import numpy as np
A = np.array([1,2])
B = np.array([[1,2],
[2,4],
[3,6],
[4,8]])
print(B/A)
(ii) import numpy as np
A = np.array([[0, 0, 0],
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
B = np.array([10, 11, 12, 13])
print(A + B)
(iii) import numpy as np
A = np.array([0, 10, 20, 30])
B = np.array([1, 2, 3])
# np.newaxis
# increase the dimension of an array by adding new axis
print(A[:, np.newaxis] + B)
(c)
[6 points] The distance function between two points x and y in the Poincare ball model
can be calculated by the following formula:
arccosh(1 + 2 · ||x − y||² / ((1 − ||x||²)(1 − ||y||²)))
= arccosh(1 + 2 · Σ_{i=1..n} (x_i − y_i)² / ((1 − Σ_{i=1..n} x_i²)(1 − Σ_{i=1..n} y_i²)))
For example, if x = (0.0, 0.0), y = (0.0, 0.1), their Poincare distance is
arccosh(1 + 2 · ((0.0 − 0.0)² + (0.0 − 0.1)²) / ((1 − (0.0² + 0.0²))(1 − (0.0² + 0.1²)))) ≈ 0.2006707
Given the following NumPy arrays, X and Y, where each 1-D array represents a data
point:
X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.2]])
Y = np.array([[0, 0.1], [0.2, 0.3]])
Compute the Poincare distance between each data point in X and each data point in Y
with a one-line Python expression, such that the result of the expression is
[[0.2006707 0.75504766]
[0.2027011 0.47971794]
[0.46441642 0.22308802]]
Note:
 An expression is a combination of values, variables, operators, and calls to functions.
Your expression should work with any numbers of data points in X and Y and any
number of values in the data points.
 You can assume that the number of attribute values in each data point are the same
for both X and Y.
 There must be no explicit loops in your expression.
You may find the following attribute or functions useful for this question.
 numpy.expand_dims(a, axis)
Insert a new axis to a that will appear at the axis position in the expanded array
shape.
 numpy.square(x)
Return the element-wise square of the input x.
 numpy.sum(a, axis)
Return the sum of array a’s elements over a given axis.
 a.T
The transposed array of a.
 numpy.matmul(a, b)
Return the matrix product of a and b.
 numpy.arccosh(x)
Return the element-wise arccosh of the input x.
Write your one-line Python expression below.', 21, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [21 points] Python Fundamentals
(a)
[9 points] Consider the following Numpy arrays:
import numpy as np
# np.arange(stop)
# return an array of evenly spaced values within the half-open interval [0,stop),
# the default step size is 1.
A = np.arange(10)
B = np.array([[5, 10, 15],
[20, 25, 30],
[35, 40, 45]])
Write the output for each of the following Python statements. If the output is an empty
array, write “Empty Array”. If an error occurs, write “Error”.
(i) print(A[1:5])
Answer:
[1 2 3 4]
(ii) print(A[1:5:2])
Answer:
[1 3]
(iii) print(A[5::-2])
Answer:
[5 3 1]
(iv) print(B[:2,1:])
Answer:
[[10 15]
[25 30]]
(v) print(B[A[:1]])
Answer:
[[5 10 15]]
(vi) print(B[A[1:]])
Answer:
Error
(vii) A[A%3 == 0] = 100
print(A)
Answer:
[100 1 2 100 4 5 100 7 8 100]
(viii) # np.mean(a, axis)
# return the average of the array elements over the specified axis.
print(np.mean(B, axis=-1))
Answer:
[10 25 40]
(ix) # np.ndarray.transpose(*axis)
# return a view of the array with axes transposed.
print(B.transpose((1,0)))
Answer:
[[5 20 35]
[10 25 40]
[15 30 45]]
Marking Scheme:
 1 point for each sub-question. No partial point. The brackets must be correct in
order to get the points. 9 points in total.
(b)
[6 points] Write the output for the following Python code segments. If the output is an
empty array, write “Empty Array”. If an error occurs, write “Error”.
(i) import numpy as np
A = np.array([1,2])
B = np.array([[1,2],
[2,4],
[3,6],
[4,8]])
print(B/A)
Answer:
[[1 1]
[2 2]
[3 3]
[4 4]]
(ii) import numpy as np
A = np.array([[0, 0, 0],
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
B = np.array([10, 11, 12, 13])
print(A + B)
Answer: Error
(iii) import numpy as np
A = np.array([0, 10, 20, 30])
B = np.array([1, 2, 3])
# np.newaxis
# increase the dimension of an array by adding new axis
print(A[:, np.newaxis] + B)
Answer:
[[1 2 3]
[11 12 13]
[21 22 23]
[31 32 33]]
Marking Scheme:
 2 points for each sub-question. No partial point. The bracket must be correct in
order to get the points. 6 points in total.
(c)
[6 points] The distance function between two points x and y in the Poincare ball model
can be calculated by the following formula:
arccosh(1 + 2 · ||x − y||² / ((1 − ||x||²)(1 − ||y||²)))
= arccosh(1 + 2 · Σ_{i=1..n} (x_i − y_i)² / ((1 − Σ_{i=1..n} x_i²)(1 − Σ_{i=1..n} y_i²)))
For example, if x = (0.0, 0.0), y = (0.0, 0.1), their Poincare distance is
arccosh(1 + 2 · ((0.0 − 0.0)² + (0.0 − 0.1)²) / ((1 − (0.0² + 0.0²))(1 − (0.0² + 0.1²)))) ≈ 0.2006707
Given the following NumPy arrays, X and Y, where each 1-D array represents a data
point:
X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.2]])
Y = np.array([[0, 0.1], [0.2, 0.3]])
Compute the Poincare distance between each data point in X and each data point in Y
with a one-line Python expression, such that the result of the expression is
[[0.2006707 0.75504766]
[0.2027011 0.47971794]
[0.46441642 0.22308802]]
Note:
 An expression is a combination of values, variables, operators, and calls to functions.
Your expression should work with any numbers of data points in X and Y and any
number of values in the data points.
 You can assume that the number of attribute values in each data point are the same
for both X and Y.
 There must be no explicit loops in your expression.
You may find the following attribute or functions useful for this question.
 numpy.expand_dims(a, axis)
Insert a new axis to a that will appear at the axis position in the expanded array
shape.
 numpy.square(x)
Return the element-wise square of the input x.
 numpy.sum(a, axis)
Return the sum of array a’s elements over a given axis.
 a.T
The transposed array of a.
 numpy.matmul(a, b)
Return the matrix product of a and b.
 numpy.arccosh(x)
Return the element-wise arccosh of the input x.
Write your one-line Python expression below.
Answer:
print(np.arccosh(1 + 2 * np.sum((np.expand_dims(X, axis=1) - Y) ** 2, axis=2) /
np.matmul(np.expand_dims(1 - np.sum(X ** 2, axis=1), axis=1),
np.expand_dims(1 - np.sum(Y ** 2, axis=1), axis=1).T)))
Marking Scheme:
 1.5 points: correct usage of np.expand_dims(), 0.5 points each. The axis must be
correct in order to get the points. Can be replaced by np.newaxis or other equivalent
functions.
 1.5 points: correct usage of np.sum(), 0.5 points each. The axis must be correct in
order to get the points.
 1.5 points: correct usage of np.square(), 0.5 points each. Can be replaced by **2.
 0.5 points: correct usage of np.matmul(). The second argument should have the
transpose (np.transpose() or .T). But if the second np.expand_dims() in the denom-
inator is expanded in axis = 0, the transpose should not occur. Can be replaced by
np.dot() or @.
 0.5 points: correct usage of np.arccosh().
 0.5 points: correct basic formula as arccosh(1 + 2*(nominator/denominator)). Gen-
erally the students can get this 0.5 points if they write their answers.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2023-spring-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [8 points] Na¨ıve Bayes Classifier
Given the following training dataset
Humidity
Wind
Temperature
Play Tennis
Day 1
High
Weak
Mild
Yes
Day 2
?
Weak
Cool
Yes
Day 3
Normal
Strong
Cool
No
Day 4
Normal
Strong
?
Yes
Day 5
High
Weak
Hot
No
Suppose we want to classify whether we will play tennis on Day 6 with the following attributes:
Humidity=High, Wind=Weak, Temperature=?
using Naïve Bayes Classifier with the given training dataset.
Unfortunately, some of the training data is missing, which is denoted as “?”. Can Naïve
Bayes Classifier handle this dataset with missing data? i.e., Can you still yield a classifica-
tion output of whether you will play tennis on Day 6?
If not, explain why. Otherwise, show how you will approach this problem.
In your answer, you should clearly mention why your explanation is valid with reference to
the property of the Na¨ıve Bayes algorithm.', 8, 9, NULL::jsonb, NULL, NULL, 'Problem 3 [8 points] Na¨ıve Bayes Classifier
Given the following training dataset
Humidity
Wind
Temperature
Play Tennis
Day 1
High
Weak
Mild
Yes
Day 2
?
Weak
Cool
Yes
Day 3
Normal
Strong
Cool
No
Day 4
Normal
Strong
?
Yes
Day 5
High
Weak
Hot
No
Suppose we want to classify whether we will play tennis on Day 6 with the following attributes:
Humidity=High, Wind=Weak, Temperature=?
using Naïve Bayes Classifier with the given training dataset.
Unfortunately, some of the training data is missing, which is denoted as “?”. Can Naïve
Bayes Classifier handle this dataset with missing data? i.e., Can you still yield a classifica-
tion output of whether you will play tennis on Day 6?
If not, explain why. Otherwise, show how you will approach this problem.
In your answer, you should clearly mention why your explanation is valid with reference to
the property of the Na¨ıve Bayes algorithm.
Answer:
Naïve Bayes can handle a dataset with missing values. Attributes are handled separately
by the algorithm at both model construction time and prediction time. Therefore, the miss-
ing attributes can simply be ignored while preparing the model, and also ignored when a
probability is calculated for a class value.
Marking Scheme:
 Case 1: Student’s answer is a clear “Yes”, or suggests that the Naïve Bayes can be used
in this scenario.
– 2 points for indicating/suggesting that Naïve Bayes can be used in this scenario.
– 3 points for mentioning the property of the Naïve Bayes, where attributes are handled
separately and/or independently.
– 3 points for mentioning the data entry with the missing value in the table can simply
be ignored.
However, answers that suggest ignoring the data for the entire day
(row), or the data for the entire attribute (column) are not acceptable.
Answer
should also include that the attribute “Temperature” for Day 6 can be ignored for
when calculating the classification result.
 Case 2: Student’s answer is a clear “No”.
– 0 point, and partial credit is NOT given for the subsequent explanation. Because,
whatever explanation is for the wrong claim.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'easy', '', '', ''),
    ('COMP2211-2023-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors
Suppose you are building a K-Nearest Neighbors classifier that predicts user preference on
movie genre based on their rating history (a 5-point scale).
User
Movie Ratings
Preferred Genre
Movie1
Movie2
Movie3
Alice
Romance
Bob
Romance
Charlie
Action
Table 1: Training Dataset
User
Movie Ratings
Preferred Genre
Movie1
Movie2
Movie3
David
?
Table 2: Test Dataset
(a)
[8 points] Based on the movie ratings in Table 1 and Table 2, calculate the Cosine and
Euclidean distance between ratings of each training data and test data (round to the
third decimal place). Fill in the distance columns in the following table, and determine
the class of David for each distance when K=1. You can find the following approxima-
tions and equations helpful for this question.
Approximated values:
√5 ≈ 2.236
√12 ≈ 3.464
√14 ≈ 3.742
√11 · √22 = √242 ≈ 15.556
√22 · √66 = √1452 ≈ 38.105
Cosine distance formula:
cos θ = (Σ_{i=1..n} X_i^train × X_i^test) / (√(Σ_{i=1..n} (X_i^train)²) × √(Σ_{i=1..n} (X_i^test)²))
Cosine Distance = 1 − cos θ
where X_i^train and X_i^test are the feature values of training data and test data, respectively,
and n is the number of features.
Euclidean distance formula:
Euclidean Distance = √(Σ_{i=1..n} (X_i^train − X_i^test)²)
where X_i^train and X_i^test are the feature values of training data and test data, respectively,
and n is the number of features.
User
Movie Ratings
Movie1
Movie2
Movie3
Preferred
Genre
Cosine
Distance
Euclidean
Distance
Alice
Romance
Bob
Romance
Charlie
Action
David
?
Class for cosine distance:
Class for Euclidean distance:
(b)
[6 points] Let’s say that Alice tends to rate movies highly, while David tends to rate
them relatively poorly. Given this scenario, which distance metric is more appropriate?
Also, describe why it makes the classifier better.', 14, 10, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors
Suppose you are building a K-Nearest Neighbors classifier that predicts user preference on
movie genre based on their rating history (a 5-point scale).
User
Movie Ratings
Preferred Genre
Movie1
Movie2
Movie3
Alice
Romance
Bob
Romance
Charlie
Action
Table 1: Training Dataset
User
Movie Ratings
Preferred Genre
Movie1
Movie2
Movie3
David
?
Table 2: Test Dataset
(a)
[8 points] Based on the movie ratings in Table 1 and Table 2, calculate the Cosine and
Euclidean distance between ratings of each training data and test data (round to the
third decimal place). Fill in the distance columns in the following table, and determine
the class of David for each distance when K=1. You can find the following approxima-
tions and equations helpful for this question.
Approximated values:
√5 ≈ 2.236
√12 ≈ 3.464
√14 ≈ 3.742
√11 · √22 = √242 ≈ 15.556
√22 · √66 = √1452 ≈ 38.105
Cosine distance formula:
cos θ = (Σ_{i=1..n} X_i^train × X_i^test) / (√(Σ_{i=1..n} (X_i^train)²) × √(Σ_{i=1..n} (X_i^test)²))
Cosine Distance = 1 − cos θ
where X_i^train and X_i^test are the feature values of training data and test data, respectively,
and n is the number of features.
Euclidean distance formula:
Euclidean Distance = √(Σ_{i=1..n} (X_i^train − X_i^test)²)
where X_i^train and X_i^test are the feature values of training data and test data, respectively,
and n is the number of features.
User
Movie Ratings
Movie1
Movie2
Movie3
Preferred
Genre
Cosine
Distance
Euclidean
Distance
Alice
Romance
Bob
Romance
Charlie
Action
David
?
Class for cosine distance:
Class for Euclidean distance:
User
Movie Ratings
Movie1
Movie2
Movie3
Preferred
Genre
Cosine
Distance
Euclidean
Distance
Alice
Romance
0.003
3.464
Bob
Romance
0.029
3.741 or 3.742
Charlie
Action
0.100
2.236
David
?
Class for cosine distance: Romance
Class for Euclidean distance: Action
Marking Scheme:
 1 point for giving each correct value. 6 points in total.
– 0.5 point for each correct value that is not rounded to the 3rd decimal place.
– No point for others (e.g., square root values or equations).
 1 point for giving each correct class. 2 points in total.
(b)
[6 points] Let’s say that Alice tends to rate movies highly, while David tends to rate
them relatively poorly. Given this scenario, which distance metric is more appropriate?
Also, describe why it makes the classifier better.
Answer:
In this scenario, the Cosine distance metric performs better, indicating that the ground
truth class is “Romance”. The reason for this is that the Cosine distance metric con-
siders the direction of the vectors being compared, while the Euclidean distance metric
takes both direction and magnitude into account. Since the Cosine distance metric is
less sensitive to variations in magnitude, it is better suited for comparing vectors with
different scales. Given that this data contains bias in terms of scale, the Cosine distance
metric is a better choice in this case.
Marking Scheme:
 1 point for stating Cosine distance metric performs better, indicating that the ground
truth class is “Romance”.
 2 points for stating Cosine distance metric considers the direction, while Euclidean
distance metric takes both direction and magnitude into account.
– Note that direction or angle should be mentioned in the state of Cosine dis-
tance metric, while magnitude should be mentioned in the state of Euclidean
distance metric.
 3 points for stating Cosine distance metric is less sensitive to variations in mag-
nitude, and the data contain bias in terms of scale, Cosine distance is a better
choice.
– 1 point for stating that Cosine distance metric is less sensitive to variations.
– 1 point for stating that Cosine distance metric is less sensitive to the biased data
which is described in the problem.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2023-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [19 points] K-Means Clustering
Consider a 2-dimensional dataset with the following 5 data points:
A(1, 4), B(2, 3), C(4, 6), D(5, 7), E(8, 3)
Perform K-Means clustering on this dataset with K = 2. Use the following initial centroids
for your calculations:
Centroid 1: A(1, 4), Centroid 2: D(5, 7)
If a tie occurs, assign the points to Centroid 1. All calculations round to two decimal places.
(a) [7 points] Calculate the Euclidean distances between each data point and both centroids.
Fill in the distances in the table. Then, assign each point to the nearest centroid.
Distance
A
B
C
D
E
Centroid 1
Centroid 2
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
(b)
[7 points] Recalculate the centroids using the mean of the points assigned to each cen-
troid. Fill in the values of new centroids after C1 and C2 in the table. Then repeat the
process of assigning points and recalculating centroids until convergence. You may not
need all the provided table templates. Leave them blank if the algorithm has already
converged. Report the final cluster assignments and centroids.
Distance
A
B
C
D
E
C1(
,
)
C2(
,
)
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
Distance
A
B
C
D
E
C1(
,
)
C2(
,
)
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
Distance
A
B
C
D
E
C1(
,
)
C2(
,
)
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
(c) [5 points] The choice of the number of clusters K in K-Means clustering can significantly
impact the clustering results. Selecting too few or too many clusters can lead to overgen-
eralization or overfitting. Why this limitation is critical for K-Means clustering? Given
a dataset with an unknown number of clusters, please explain one way to determine a
suitable K.', 19, 12, NULL::jsonb, NULL, NULL, 'Problem 5 [19 points] K-Means Clustering
Consider a 2-dimensional dataset with the following 5 data points:
A(1, 4), B(2, 3), C(4, 6), D(5, 7), E(8, 3)
Perform K-Means clustering on this dataset with K = 2. Use the following initial centroids
for your calculations:
Centroid 1: A(1, 4), Centroid 2: D(5, 7)
If a tie occurs, assign the points to Centroid 1. All calculations round to two decimal places.
(a) [7 points] Calculate the Euclidean distances between each data point and both centroids.
Fill in the distances in the table. Then, assign each point to the nearest centroid.
Distance
A
B
C
D
E
Centroid 1
Centroid 2
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
Answers:
Distance
A
B
C
D
E
Centroid 1
0.00
1.41
3.61
5.00
7.07
Centroid 2
5.00
5.00
1.41
0.00
5.00
Data points assigned to Centroid 1: A, B
Data points assigned to Centroid 2: C, D, E
Marking Scheme:
 0.5 point for each correct value. 5 points in total.
 1 point for giving the data points assigned to each centroid. 2 points in total.
 -0.5 points for not rounding to two decimal places.
(b)
[7 points] Recalculate the centroids using the mean of the points assigned to each cen-
troid. Fill in the values of new centroids after C1 and C2 in the table. Then repeat the
process of assigning points and recalculating centroids until convergence. You may not
need all the provided table templates. Leave them blank if the algorithm has already
converged. Report the final cluster assignments and centroids.
Distance
A
B
C
D
E
C1(
,
)
C2(
,
)
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
Distance
A
B
C
D
E
C1(
,
)
C2(
,
)
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
Distance
A
B
C
D
E
C1(
,
)
C2(
,
)
Data points assigned to Centroid 1:
Data points assigned to Centroid 2:
Answer:
Distance
A
B
C
D
E
C1(1.5, 3.5)
0.71
0.71
3.54
4.95
6.52
C2(5.67, 5.33)
4.86
4.35
1.80
1.80
3.30
Data points assigned to Centroid 1: A, B
Data points assigned to Centroid 2: C, D, E
Marking Scheme:
 0.5 points for each correct value. 5 points in total.
 1 point for giving the data points assigned to each centroid. 2 points in total.
 -0.5 points for not rounding to two decimal places.
 -1 point for filling in/copying more than 1 table template (incorrect convergence
statement).
(c) [5 points] The choice of the number of clusters K in K-Means clustering can significantly
impact the clustering results. Selecting too few or too many clusters can lead to overgen-
eralization or overfitting. Why this limitation is critical for K-Means clustering? Given
a dataset with an unknown number of clusters, please explain one way to determine a
suitable K.
Answer:
Clustering is an unsupervised learning method. In the scenario where you try to adopt K-
Means clustering or any other unsupervised models, you don’t know anything about the
shape, the number of clusters, or the distribution of the dataset. Such value k is exactly
what you are looking for and will never be given beforehand. One common method to
determine the optimal number of clusters is the elbow method, which identifies the point
where adding more clusters does not improve the clustering performance significantly.
However, this method can be subjective and may not always yield the optimal number
of clusters. Other methods, such as silhouette analysis or gap statistics, can also be used
to determine the optimal number of clusters.
Marking Scheme:
 2 points for stating that K-Means is an unsupervised method and the best value K
will never be given beforehand.
 3 points for giving and explaining a way to determine a suitable K.
 -1 point for stating why the limitation is critical for K-Means clustering but the
answer is minor.
 -1 point for paraphrasing the prompt.
 -1 point for using only SSE to evaluate the model and use the K at minimum SSE.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2023-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [12 points] Perceptron
Given the following training dataset and test dataset:
Training
Test
x1
-2
-1
x2
-2
-1
T
-1
-1
-1
-1
(a)
[2 points] Suppose we use the training dataset to train a perceptron. At some point
during the training, the weights and bias of the perceptron have not changed after four
consecutive updates, but the current epoch is not yet finished.
Should we stop the
training? Explain why.
(b)
[2 points]
After the training has converged, what is the range of possible accuracy on the test
dataset?
A. 50%
B. 50% ∼75%
C. 50% ∼100%
D. 25% ∼75%
E. 0% ∼100%
(c)
[2 points] Consider the following implementation of a perceptron:
import numpy as np
def fit(X, T, W0, b0, eta, max_epochs):
W = np.ones(X.shape[0]) * W0
b = b0
for epoch in range(max_epochs):
for i in range(T.shape[0]):
O = predict(X, W, b)
E = T[i] - O
W = W + E * X
b = b + E
return W, b
def predict(X, W, b):
Z = np.dot(X, W) + b
O = (Z <= 0) * 2 - 1
return O
The predict function takes three inputs:
 A 1-D or 2-D NumPy array, X, storing test example(s). The shape of X is (d,) or (n,
d) where n is the number of test examples and d is the number of features;
 A 1-D NumPy array, W, storing the weights of the perceptron. The shape of W is (d,)
where d is the number features;
 A bias value, b, for the perceptron.
Based on how the predict function is implemented, what is the activation function f(z)
of this perceptron?
(d)
[6 points]
The fit function takes six inputs:
 A 2-D NumPy array, X, storing training examples. The shape of X is (m, d) where
m is the number of training examples and d is the number of features;
 A 1-D NumPy array, T, storing training targets. The shape of T is (m,) where m is
the number training examples;
 An initial value, W0, for the weights of the perceptron;
 An initial value, b0, for the bias of the perceptron;
 The learning rate, eta;
 The maximum number of training epochs, max_epochs.
Given these inputs, however, the fit function is not implemented correctly. Identify all
the errors by giving the line numbers that cause the errors, and propose ways to fix them.
You may find the following attribute or functions useful for this question.
 numpy.ones(shape)
Return a new array of given shape, shape, filled with ones.
 range(stop)
Return a sequence of numbers starting from 0, incrementing by 1, and ending at the
value stop.
 ndarray.shape
Tuple of array dimensions.
 numpy.dot(a, b)
Return the dot product of a and b if a and b are both 1-D arrays. If a is an N-D
array and b is a 1-D array, it is a sum product over the last axis of a and b is
returned.', 12, 15, NULL::jsonb, NULL, NULL, 'Problem 6 [12 points] Perceptron
Given the following training dataset and test dataset:
Training
Test
x1
-2
-1
x2
-2
-1
T
-1
-1
-1
-1
(a)
[2 points] Suppose we use the training dataset to train a perceptron. At some point
during the training, the weights and bias of the perceptron have not changed after four
consecutive updates, but the current epoch is not yet finished. Should we stop the train-
ing? Explain why.
Answer:
Yes, the training has converged so we should stop it from wasting time and resources.
Marking Scheme:
 1 point for stating we should stop the training.
 1 point for giving correct explanations.
(b)
[2 points]
After the training has converged, what is the range of possible accuracy on the test
dataset?
A. 50%
B. 50% ∼75%
C. 50% ∼100%
D. 25% ∼75%
E. 0% ∼100%
Answer:
D
Marking Scheme:
 2 points for the correct answer.
(c)
[2 points] Consider the following implementation of a perceptron:
import numpy as np
def fit(X, T, W0, b0, eta, max_epochs):
W = np.ones(X.shape[0]) * W0
b = b0
for epoch in range(max_epochs):
for i in range(T.shape[0]):
O = predict(X, W, b)
E = T[i] - O
W = W + E * X
b = b + E
return W, b
def predict(X, W, b):
Z = np.dot(X, W) + b
O = (Z <= 0) * 2 - 1
return O
The predict function takes three inputs:
 A 1-D or 2-D NumPy array, X, storing test example(s). The shape of X is (d,) or (n,
d) where n is the number of test examples and d is the number of features;
 A 1-D NumPy array, W, storing the weights of the perceptron. The shape of W is (d,)
where d is the number features;
 A bias value, b, for the perceptron.
Based on how the predict function is implemented, what is the activation function f(z)
of this perceptron?
Answer:
f(z) = 1 if z ≤ 0, −1 otherwise
Marking Scheme:
 2 points for giving the correct activation function.
(d)
[6 points]
The fit function takes six inputs:
 A 2-D NumPy array, X, storing training examples. The shape of X is (m, d) where
m is the number of training examples and d is the number of features;
 A 1-D NumPy array, T, storing training targets. The shape of T is (m,) where m is
the number training examples;
 An initial value, W0, for the weights of the perceptron;
 An initial value, b0, for the bias of the perceptron;
 The learning rate, eta;
 The maximum number of training epochs, max_epochs.
Given these inputs, however, the fit function is not implemented correctly. Identify all
the errors by giving the line numbers that cause the errors, and propose ways to fix them.
You may find the following attribute or functions useful for this question.
 numpy.ones(shape)
Return a new array of given shape, shape, filled with ones.
 range(stop)
Return a sequence of numbers starting from 0, incrementing by 1, and ending at the
value stop.
 ndarray.shape
Tuple of array dimensions.
 numpy.dot(a, b)
Return the dot product of a and b if a and b are both 1-D arrays. If a is an N-D
array and b is a 1-D array, it is a sum product over the last axis of a and b is
returned.
Answer:
 On line 4, change X.shape[0] into X.shape[1]
 On line 8 and 10, change X into X[i]
 On line 9, multiply the right-hand side with eta
 On line 9, change T[i]-O into O-T[i], or equivalently, on line 10 and 11, change +
into -.
The resulting fit function is as follows:
def fit(X, T, W0, b0, eta, max_epochs):
W = np.ones(X.shape[1]) * W0
b = b0
for epoch in range(max_epochs):
for i in range(T.shape[0]):
O = predict(X[i], W, b)
E = eta * (O - T[i])
W = W + E * X[i]
b = b + E
return W, b
Other equivalent solutions also exist.
Marking Scheme:
 2 points for each error; 1 point for identifying the error (pointing out only the line
number does not count), and 1 point for correctly fixing the error. Partial points are
given to incomplete fixes of an error.
 For fixes that correctly fixed an error but accidentally introduced other errors, points
are partially deducted. No extra points are deducted for purely irrelevant/erroneous
modifications of the code.
 The final mark is capped at 6 points.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2023-spring-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [16 points] Multilayer Perceptron
(a)
[4 points] Consider the following dataset
x1
x2
F(x1,x2)
Given the following single layer perceptron, where activation function f is defined as
f(z) =
(
1, for z ≥0
0, otherwise
Can we classify all the data points correctly using this perceptron? If yes, give a set of
possible parameters. If no, briefly explain the reason.
(b)
[2 points] Consider the following dataset
x1
x2
F(x1,x2)
Using the same perceptron architecture in part (a), can we classify all the data points
correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly
explain the reason.
(c)
[10 points] Consider the same dataset in part (b), given the following multilayer percep-
tron, where activation function f is defined as identical with that in part (a),
can we classify all the data points correctly using this multilayer perceptron? If yes, give
a set of possible parameters. If no, briefly explain the reason.
Hint: You don’t need to use gradient descent to calculate the results.
You may try
to find a set of parameters to make the data points linear separable after the first layer
mapping.
-------------------- END OF PAPER --------------------', 16, 18, NULL::jsonb, NULL, NULL, 'Problem 7 [16 points] Multilayer Perceptron
(a)
[4 points] Consider the following dataset
x1
x2
F(x1,x2)
Given the following single layer perceptron, where activation function f is defined as
f(z) =
(
1, for z ≥0
0, otherwise
Can we classify all the data points correctly using this perceptron? If yes, give a set of
possible parameters. If no, briefly explain the reason.
Answer:
Yes, the four data points are linearly separable.
One possible set of parameters is
w1 = 1, w2 = 1, θ = −1.5.
Marking Scheme:
 1 point for ‘yes’ answer.
 3 points for correct parameter (no partial points).
(b)
[2 points] Consider the following dataset
x1
x2
F(x1,x2)
Using the same perceptron architecture in part (a), can we classify all the data points
correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly
explain the reason.
Answer:
No, the 4 data points are not linearly separable, so they can’t be fit with 100% accuracy
using single layer perceptron.
Marking Scheme:
 1 point for ‘yes’ answer.
 3 points for correct parameters (no partial points).
(c)
[10 points] Consider the same dataset in part (b), given the following multilayer percep-
tron, where activation function f is defined as identical with that in part (a),
can we classify all the data points correctly using this multilayer perceptron? If yes, give
a set of possible parameters. If no, briefly explain the reason.
Hint: You don’t need to use gradient descent to calculate the results.
You may try
to find a set of parameters to make the data points linear separable after the first layer
mapping.
Answer:
Yes, one possible set of parameters can be obtained by the following procedure:
we first use two linear functions to separate the 4 data points, e.g., w1 = −1, w2 = 1, θ1 =
−0.5, w3 = −1, w4 = 1, θ2 = 0.5, the related diagram is as follows,
(1,1)
(0,1)
(x1)
(x2)
(1,0)
(0,0)
𝑤1𝑥1 + 𝑤2𝑥2 + 𝜃1
𝑤3𝑥1 + 𝑤4𝑥2 + 𝜃2
After first layer mapping, the result will be
x1
x2
y1 = w1x1 + w2x2 + θ1
y2 = w3x1 + w4x2 + θ2
f(y1)
f(y2)
F(x1, x2)
-0.5
0.5
0.5
1.5
-1
-0.5
-0.5
0.5
For last three columns of the table, we can see the datapoints are linear separable after
the first layer, where we can set w5 = −1, w6 = 1, θ3 = −0.5 to fulfill the requirement.
To sum up, one possible set of parameters are
w1 = −1, w2 = 1, θ1 = −0.5, w3 = −1, w4 = 1, θ2 = 0.5, w5 = −1, w6 = 1, θ3 = −0.5
Marking Scheme:
 1 point for ‘yes’ answer.
 6 points if first layer mapping makes all data points linear separable (3 points if
the data points are ‘almost’ linear separable, e.g., one set of parameters are correct
except that the signs are opposite).
 3 points for correct second layer parameters.
-------------------- END OF PAPER --------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [5 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 0.5 point for each correct answer.
Question
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)
Answer
(a) In the following code
import numpy as np
a = np.array([0, 1, 1, 2, 5, 5, 3])
b = np.array([0, 1, 2, 3, 4, 5])
c = (b == a.reshape(7, 1))
The array c has the shape (7, 1).
(b) After executing the following block of code:
import numpy as np
a = np.array([[1, 2], [3, 4], [5, 6]])
b = np.array([[1, 2, 3], [0, 0, 0], [1, 0, 0]])
c = a.dot(b)
The array c is array([[22, 28], [0, 0], [1, 2]]).
(c) The Naïve Bayes Classifier operates under the assumption that the presence of a particular
feature in a class is independent of the presence of any other feature.
(d) In Naïve Bayes, we assume that P(B|e1, e2) = P(B|e1)P(B|e2) where B is our belief
and (e1, e2) are evidence.
(e) In Naïve Bayes, given P(B = b), P(e1|B = b), and P(e2|B = b) for each possible belief
b, we can compute P(B = b′|e1, e2) for any b′.
(f) K-Nearest Neighbors Classifier CANNOT handle data with categorical features since
it is difficult to find the distance between categorical features.
(g) In K-Nearest Neighbors for binary classification, odd values of k are usually preferred.
(h) A 6-fold cross validation for K-nearest neighbors algorithm means that for each value
of K, we randomly select 1/6 of the training data as the validation set to evaluate the
model which is trained by the remaining (5/6) of the training data.
(i) The result of the K-Means Clustering DOES NOT depend on the initial centroids.
(j) It is possible that after new cluster centroids are computed by the K-Means Clustering
Algorithm, a cluster centroid may be associated with an empty cluster (i.e., with zero
points in it).', 5, 3, NULL::jsonb, NULL, NULL, 'Problem 1 [5 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 0.5 point for each correct answer.
Question
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)
Answer
(a) In the following code
import numpy as np
a = np.array([0, 1, 1, 2, 5, 5, 3])
b = np.array([0, 1, 2, 3, 4, 5])
c = (b == a.reshape(7, 1))
The array c has the shape (7, 1).
(b) After executing the following block of code:
import numpy as np
a = np.array([[1, 2], [3, 4], [5, 6]])
b = np.array([[1, 2, 3], [0, 0, 0], [1, 0, 0]])
c = a.dot(b)
The array c is array([[22, 28], [0, 0], [1, 2]]).
(c) The Naïve Bayes Classifier operates under the assumption that the presence of a particular
feature in a class is independent of the presence of any other feature.
(d) In Naïve Bayes, we assume that P(B|e1, e2) = P(B|e1)P(B|e2) where B is our belief
and (e1, e2) are evidence.
(e) In Naïve Bayes, given P(B = b), P(e1|B = b), and P(e2|B = b) for each possible belief
b, we can compute P(B = b′|e1, e2) for any b′.
(f) K-Nearest Neighbors Classifier CANNOT handle data with categorical features since
it is difficult to find the distance between categorical features.
(g) In K-Nearest Neighbors for binary classification, odd values of k are usually preferred.
(h) A 6-fold cross validation for K-nearest neighbors algorithm means that for each value
of K, we randomly select 1/6 of the training data as the validation set to evaluate the
model which is trained by the remaining (5/6) of the training data.
(i) The result of the K-Means Clustering DOES NOT depend on the initial centroids.
(j) It is possible that after new cluster centroids are computed by the K-Means Clustering
Algorithm, a cluster centroid may be associated with an empty cluster (i.e., with zero
points in it).
Question
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)
Answer
F
F
T
F
T
F
T
F
F
T', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'easy', '', '', ''),
    ('COMP2211-2024-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [19 points] Advanced Python for Artificial Intelligence
(a)
[13 points] Consider the following NumPy arrays:
import numpy as np
# np.arange(start, stop)
# - return an array of evenly spaced values within the half-open interval
#
[start,stop), the default step size is 1.
# np.ones(shape):
# - return a new array of given shape, filled with ones.
A = np.arange(5,15)
B = np.ones((3,2))
C = np.array([[[0,1,2,3],
[4,5,6,7]],
[[8,9,10,11],
[12,13,14,15]],
[[16,17,18,19],
[20,21,22,23]]])
Suppose the following Python statements are running consecutively. Write the output
for each of the following Python statements.
If the output is an empty array, write
“Empty Array”. If an error occurs, write “Error”.
(i) print(A[2:-2:3])
(ii) # np.ndarray.reshape(shape)
# - return an array containing the same data with a new shape.
print(B.reshape(3,-1,2))
(iii) Please write a line of Python code to get the same result in part (a)(ii) by using
np.expand_dims on B.
# np.expand_dims(a, axis)
# - insert a new axis to a that will appear at the axis position in the
#
expanded array shape. Return an array that is the view of a with the
#
number of dimensions increased.
(iv) Please write a line of Python code to create the array C by using the functions
np.ndarray.reshape and np.arange().
(v) # np.mean(a, axis)
# - return a new array containing the average values of a over the specified axis.
print(np.mean(C,axis = 2))
(vi) # np.transpose(a, axes)
# - return an array with axes transposed in a.
print(np.transpose(C, (1, 0, 2)))
(vii) # a@b
# - return the matrix multiplication of the two arrays a and b.
D = A.reshape(2, 5)
print(B@D)
(viii) # np.dot(a,b)
# - return the dot product of the two arrays a and b. If both a and b are
#
1-D arrays, it is the inner product of vectors and returns a scalar.
#
If both a and b are bool arrays, the output is in bool datatype.
# np.ndarray.astype(dtype)
# - return the copy of the array with a specified dtype.
E = A < 12
F = A % 5 == 0
print(np.dot(E, F).astype(int))
(ix) print(np.dot(E, F.astype(int)))
(x) print( C / B )
(xi) # np.newaxis
# - increase the dimension of an array by adding new axis.
print(C / B[..., np.newaxis])
(xii) G = C[0, :, 2:]
print(G)
(xiii) C[0, 1, 2] = 100
print(G)
Scheme:
 1 point for giving the correct answer for each part. 13 points in total.
(b)
[6 points] In recommendation systems, we often recommend items similar to what the
user likes. This is known as content-based filtering. In content-based filtering, an item is
represented as a feature vector (or a 1D array). Given the following code which computes
the cosine similarity between feature vectors of items in explicit loops:
def compute_cosine_similarity_loops(X):
    num_items, num_features = X.shape
    # --- BLOCK TO REWRITE ---
    X_normalized = np.zeros((num_items, num_features))
    for i in range(num_items):
        feature_norm = np.sqrt(np.sum(X[i] ** 2))
        X_normalized[i] = X[i] / feature_norm
    similarities = np.zeros((num_items, num_items))
    for i in range(num_items):
        for j in range(num_items):
            similarities[i, j] = np.sum(X_normalized[i] * X_normalized[j])
    # --- BLOCK TO REWRITE ---
    return similarities
X = np.array([[0, 2], [1, -1], [1, 1]])
print(compute_cosine_similarity_loops(X))
# Output:
# [[ 1.         -0.70710678  0.70710678]
#  [-0.70710678  1.          0.        ]
#  [ 0.70710678  0.          1.        ]]
Rewrite the block of code between the comment lines “# --- BLOCK TO REWRITE --- ”
using no explicit loops in the space provided. You may find the following functions
useful for this question.
 Element-wise square of an array:
np.square(A)
– A is the input array
This is equivalent to A ** 2.
 Element-wise square root of an array:
np.sqrt(A)
– A is the input array
 Sum of array elements over a given axis:
np.sum(A, axis)
– A is the input array
– axis is the axis across which the array is summed
 Insert a new axis of size 1 to an array:
np.expand_dims(A, axis)
– A is the input array
– axis is the position where the axis is to be inserted
If axis is 0, this is equivalent to A[np.newaxis] and A[None]. If axis is 1, this is
equivalent to A[:, np.newaxis] and A[:, None].
 Transpose of an array:
np.transpose(A)
– A is the input array
This is equivalent to A.T.
 Matrix multiplication:
np.matmul(A, B)
– A is the left array for matrix multiplication
– B is the right array for matrix multiplication
This is equivalent to A @ B.
More information on matrix multiplication: suppose A.shape[1] == B.shape[0],
then
C = np.matmul(A, B)
means that for each i and j,
C[i, j] == np.sum(A[i] * B[:, j])
Write your code in the space below.', 19, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [19 points] Advanced Python for Artificial Intelligence
(a)
[13 points] Consider the following NumPy arrays:
import numpy as np
# np.arange(start, stop)
# - return an array of evenly spaced values within the half-open interval
#
[start,stop), the default step size is 1.
# np.ones(shape):
# - return a new array of given shape, filled with ones.
A = np.arange(5,15)
B = np.ones((3,2))
C = np.array([[[0,1,2,3],
[4,5,6,7]],
[[8,9,10,11],
[12,13,14,15]],
[[16,17,18,19],
[20,21,22,23]]])
Suppose the following Python statements are running consecutively. Write the output
for each of the following Python statements.
If the output is an empty array, write
“Empty Array”. If an error occurs, write “Error”.
(i) print(A[2:-2:3])
Answer:
[7 10]
(ii) # np.ndarray.reshape(shape)
# - return an array containing the same data with a new shape.
print(B.reshape(3,-1,2))
Answer:
[[[1 1]]
[[1 1]]
[[1 1]]]
(iii) Please write a line of Python code to get the same result in part (a)(ii) by using
np.expand_dims on B.
# np.expand_dims(a, axis)
# - insert a new axis to a that will appear at the axis position in the
#
expanded array shape. Return an array that is the view of a with the
#
number of dimensions increased.
Answer:
np.expand_dims(B, 1)
(also correct if adding print())
(iv) Please write a line of Python code to create the array C by using the functions
np.ndarray.reshape and np.arange().
Answer:
np.arange(24).reshape(3, 2, 4)
(also correct if adding C = )
(v) # np.mean(a, axis)
# - return a new array containing the average values of a over the specified axis.
print(np.mean(C,axis = 2))
Answer:
[[1.5 5.5]
[9.5 13.5]
[17.5 21.5]]
(vi) # np.transpose(a, axes)
# - return an array with axes transposed in a.
print(np.transpose(C, (1, 0, 2)))
Answer:
[[[0 1 2 3]
[8 9 10 11]
[16 17 18 19]]
[[4 5 6
7]
[12 13 14 15]
[20 21 22 23]]]
(vii) # a@b
# - return the matrix multiplication of the two arrays a and b.
D = A.reshape(2, 5)
print(B@D)
Answer:
[[15 17 19 21 23]
[15 17 19 21 23]
[15 17 19 21 23]]
(viii) # np.dot(a,b)
# - return the dot product of the two arrays a and b. If both a and b are
#
1-D arrays, it is the inner product of vectors and returns a scalar.
#
If both a and b are bool arrays, the output is in bool datatype.
# np.ndarray.astype(dtype)
# - return the copy of the array with a specified dtype.
E = A < 12
F = A % 5 == 0
print(np.dot(E, F).astype(int))
Answer:
1
(ix) print(np.dot(E, F.astype(int)))
Answer:
2
(x) print( C / B )
Answer:
Error
(xi) # np.newaxis
# - increase the dimension of an array by adding new axis.
print(C / B[..., np.newaxis])
Answer:
[[[0,1,2,3],
[4,5,6,7]],
[[8,9,10,11],
[12,13,14,15]],
[[16,17,18,19],
[20,21,22,23]]]
(xii) G = C[0, :, 2:]
print(G)
Answer:
[[2 3]
[6 7]]
(xiii) C[0, 1, 2] = 100
print(G)
Answer:
[[2 3]
[100 7]]
Scheme:
 1 point for giving the correct answer for each part. 13 points in total.
(b)
[6 points] In recommendation systems, we often recommend items similar to what the
user likes. This is known as content-based filtering. In content-based filtering, an item is
represented as a feature vector (or a 1D array). Given the following code which computes
the cosine similarity between feature vectors of items in explicit loops:
def compute_cosine_similarity_loops(X):
    num_items, num_features = X.shape
    # --- BLOCK TO REWRITE ---
    X_normalized = np.zeros((num_items, num_features))
    for i in range(num_items):
        feature_norm = np.sqrt(np.sum(X[i] ** 2))
        X_normalized[i] = X[i] / feature_norm
    similarities = np.zeros((num_items, num_items))
    for i in range(num_items):
        for j in range(num_items):
            similarities[i, j] = np.sum(X_normalized[i] * X_normalized[j])
    # --- BLOCK TO REWRITE ---
    return similarities
X = np.array([[0, 2], [1, -1], [1, 1]])
print(compute_cosine_similarity_loops(X))
# Output:
# [[ 1.         -0.70710678  0.70710678]
#  [-0.70710678  1.          0.        ]
#  [ 0.70710678  0.          1.        ]]
Rewrite the block of code between the comment lines “# --- BLOCK TO REWRITE --- ”
using no explicit loops in the space provided. You may find the following functions
useful for this question.
 Element-wise square of an array:
np.square(A)
– A is the input array
This is equivalent to A ** 2.
 Element-wise square root of an array:
np.sqrt(A)
– A is the input array
 Sum of array elements over a given axis:
np.sum(A, axis)
– A is the input array
– axis is the axis across which the array is summed
 Insert a new axis of size 1 to an array:
np.expand_dims(A, axis)
– A is the input array
– axis is the position where the axis is to be inserted
If axis is 0, this is equivalent to A[np.newaxis] and A[None]. If axis is 1, this is
equivalent to A[:, np.newaxis] and A[:, None].
 Transpose of an array:
np.transpose(A)
– A is the input array
This is equivalent to A.T.
 Matrix multiplication:
np.matmul(A, B)
– A is the left array for matrix multiplication
– B is the right array for matrix multiplication
This is equivalent to A @ B.
More information on matrix multiplication: suppose A.shape[1] == B.shape[0],
then
C = np.matmul(A, B)
means that for each i and j,
C[i, j] == np.sum(A[i] * B[:, j])
Write your code in the space below.
Answer:
X_norm = np.sqrt(np.sum(X ** 2, 1))
# (n,)
# 2 points
X_normalized = X / X_norm[:, None]
# (n, d) # 2 points
similarities = X_normalized @ X_normalized.T
# (n, n) # 2 points
The solution is not unique.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-midterm', '3', NULL, 3, 'long_question', 'coding', 'Problem 3 [16 points] Model Evaluation and Advanced Python Programming
In this problem, you need to implement the evaluation metrics for multi-class classifiers.
Specifically, you need to implement the confusion matrix, accuracy, precision, recall, and
macro F1 score. We provide the related definitions and formulas as follows.
(a)
[5.5 points] Suppose there is a test dataset consisting of 10 data points, their actual
classes are [2, 1, 1, 2, 0, 1, 0, 0, 1, 1], and their predicted classes by a classifier model
are [2, 1, 2, 1, 0, 2, 1, 0, 0, 0]. What is the confusion matrix for the prediction results?
What are TP, TN, FP, FN for each class?
Confusion matrix: a table that summarized actual labels and the predictions of classifi-
cation. For multi-class classification, the confusion matrix has the shape (num classes,
num classes) that records the number of occurrences between actual labels and the pre-
dictions. The classes are listed in the same order in the rows as in the columns, therefore
the correctly classified elements are located on the main diagonal.
For each class i, TPi, TNi, FPi, FNi represent the instance numbers of true positive,
true negative, false positive, and false negative, respectively.
 True positive: A test result where the classifier correctly predicts the positive class
as positive.
 True negative: A test result where the classifier correctly predicts the negative class
as negative.
 False positive: A test result where the classifier incorrectly predicts the negative class
as positive.
 False negative: A test result where the classifier incorrectly predicts the positive class
as negative.
Please fill in the confusion matrix (the rows represent actual class and the columns
represent predicted class) and the TP, TN, FP, FN table.
Predicted
Actual
Class
Class
TP
TN
FP
FN
(b)
[1 point] What is the accuracy score of the classifier model on the above test data?
Accuracy = (Σ_{i=1}^{N} TP_i) / num_testdata, where N is the number of classes.
(c)
[2.5 points] What are the precisions, recalls, and F1 scores for each class of the classifier
model on the above test data? Please fill in the table using fractions or keep 3 decimals.
For each class i, Precision_i = TP_i / (TP_i + FP_i)
For each class i, Recall_i = TP_i / (TP_i + FN_i)
For each class i, F1_i = (2 · Precision_i · Recall_i) / (Precision_i + Recall_i)
Class
Precision
Recall
F1 score
(d)
[1 point] What is the macro F1 score of the classifier model on the above test data?
Please keep 3 decimals in your answer.
The macro F1 score is the unweighted mean of the F1 scores of all classes:
Macro-F1 = (Σ_{i=1}^{N} F1_i) / N, where N is the number of classes.
(e) [6 points] Given two NumPy 1D arrays with the same shape (num_testdata,): test_actual
and test_predict, representing the actual class labels and predicted class labels for the
test data, please implement the following functions by filling in the blanks. For each
TODO, please use a one-line Python expression.
import numpy as np
def generate_confusion_matrix(test_actual, test_predict):
# TODO 1: Get num_classes, the number of classes in the test data.
# Note that the classes in the test_actual and test_predict are represented
# in integer indices from [0, 1, ..., num_classes - 1].
num_classes = _________________________________________
confusion_matrix = np.zeros((num_classes, num_classes))
# TODO 2: Get the values of confusion_matrix, where the rows represent
# actual class and the columns represent predicted class.
for i in range(0, num_classes):
for j in range(0, num_classes):
confusion_matrix[i, j] = _____________________________________
return confusion_matrix
def calculate_evaluation_metrics(test_actual, test_predict):
confusion_matrix = generate_confusion_matrix(test_actual, test_predict)
# TODO 3: Get the accuracy score, which is a scalar value.
accuracy = ______________________________________________________
# TODO 4: Get the precisions for all classes, which is a 1D array
# with shape (num_classes, ).
precision = _____________________________________________________
# TODO 5: Get the recalls for all classes, which is a 1D array
# with shape (num_classes, ).
recall = ________________________________________________________
# TODO 6: Get the macro F1 score, which is a scalar value.
macro_f1 = ______________________________________________________
return accuracy, precision, recall, macro_f1
Note:
 An expression is a combination of values, variables, operators, and calls to functions.
 There must be no explicit loops in your expression.
 Your implemented functions should work with any number of test data points and
any number of classes.
 You cannot use any variable that is not defined inside the function or any global
variable.
You may find the following attribute or functions useful for this problem.
 np.max(a, axis = None)
- return the maximum of the array a along the given axis. If axis is None, the result
is a scalar value.
 np.ndarray.sum(axis = None)
- return the sum of the array over the given axis. If axis is None, the result is a
scalar value.
 np.ndarray.diagonal()
- if the array is 2D, then a 1D array containing the diagonal elements is returned.
 np.mean(a, axis)
- return a new array containing the average values of a over the specified axis. If
axis is None, the result is a scalar value.
Write your code in the space below.', 16, 10, NULL::jsonb, NULL, NULL, 'Problem 3 [16 points] Model Evaluation and Advanced Python Programming
In this problem, you need to implement the evaluation metrics for multi-class classifiers.
Specifically, you need to implement the confusion matrix, accuracy, precision, recall, and
macro F1 score. We provide the related definitions and formulas as follows.
(a)
[5.5 points] Suppose there is a test dataset consisting of 10 data points, their actual
classes are [2, 1, 1, 2, 0, 1, 0, 0, 1, 1], and their predicted classes by a classifier model
are [2, 1, 2, 1, 0, 2, 1, 0, 0, 0]. What is the confusion matrix for the prediction results?
What are TP, TN, FP, FN for each class?
Confusion matrix: a table that summarized actual labels and the predictions of classifi-
cation. For multi-class classification, the confusion matrix has the shape (num classes,
num classes) that records the number of occurrences between actual labels and the pre-
dictions. The classes are listed in the same order in the rows as in the columns, therefore
the correctly classified elements are located on the main diagonal.
For each class i, TPi, TNi, FPi, FNi represent the instance numbers of true positive,
true negative, false positive, and false negative, respectively.
 True positive: A test result where the classifier correctly predicts the positive class
as positive.
 True negative: A test result where the classifier correctly predicts the negative class
as negative.
 False positive: A test result where the classifier incorrectly predicts the negative class
as positive.
 False negative: A test result where the classifier incorrectly predicts the positive class
as negative.
Please fill in the confusion matrix (the rows represent actual class and the columns
represent predicted class) and the TP, TN, FP, FN table.
Predicted
Actual
Class
Class
TP
TN
FP
FN
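Answer:
Confusion matrix (rows = actual class, columns = predicted class):
Actual / Predicted | 0 | 1 | 2
0 | 2 | 1 | 0
1 | 2 | 1 | 2
2 | 0 | 1 | 1
Class | TP | TN | FP | FN
0 | 2 | 5 | 2 | 1
1 | 1 | 3 | 2 | 4
2 | 1 | 6 | 2 | 1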
Scheme:
 0.25 point for each correct numeric value. 5.25 points in total.
 An extra 0.25 point is given for those who gave all the correct numeric values.
(b)
[1 point] What is the accuracy score of the classifier model on the above test data?
Accuracy = (Σ_{i=1}^{N} TP_i) / num_testdata, where N is the number of classes.
Answer:
0.4
Scheme:
 1 point for the correct answer.
(c)
[2.5 points] What are the precisions, recalls, and F1 scores for each class of the classifier
model on the above test data? Please fill in the table using fractions or keep 3 decimals.
For each class i, Precision_i = TP_i / (TP_i + FP_i)
For each class i, Recall_i = TP_i / (TP_i + FN_i)
For each class i, F1_i = (2 · Precision_i · Recall_i) / (Precision_i + Recall_i)
Class
Precision
Recall
F1 score
Answer:
Class | Precision | Recall | F1 score
0 | 1/2 | 2/3 | 4/7
1 | 1/3 | 1/5 | 1/4
2 | 1/3 | 1/2 | 2/5
Scheme:
 0.25 point for each correct numeric value (or fraction). 2.25 points in total.
 An extra 0.25 point is given for those who gave all the correct numeric values (or
fractions).
(d)
[1 point] What is the macro F1 score of the classifier model on the above test data?
Please keep 3 decimals in your answer.
The macro F1 score is the unweighted mean of the F1 scores of all classes:
Macro-F1 = (Σ_{i=1}^{N} F1_i) / N, where N is the number of classes.
Answer:
0.407
Scheme:
 1 point for the correct answer.
(e) [6 points] Given two NumPy 1D arrays with the same shape (num_testdata,): test_actual
and test_predict, representing the actual class labels and predicted class labels for the
test data, please implement the following functions by filling in the blanks. For each
TODO, please use a one-line Python expression.
import numpy as np
def generate_confusion_matrix(test_actual, test_predict):
# TODO 1: Get num_classes, the number of classes in the test data.
# Note that the classes in the test_actual and test_predict are represented
# in integer indices from [0, 1, ..., num_classes - 1].
num_classes = _________________________________________
confusion_matrix = np.zeros((num_classes, num_classes))
# TODO 2: Get the values of confusion_matrix, where the rows represent
# actual class and the columns represent predicted class.
for i in range(0, num_classes):
for j in range(0, num_classes):
confusion_matrix[i, j] = _____________________________________
return confusion_matrix
def calculate_evaluation_metrics(test_actual, test_predict):
confusion_matrix = generate_confusion_matrix(test_actual, test_predict)
# TODO 3: Get the accuracy score, which is a scalar value.
accuracy = ______________________________________________________
# TODO 4: Get the precisions for all classes, which is a 1D array
# with shape (num_classes, ).
precision = _____________________________________________________
# TODO 5: Get the recalls for all classes, which is a 1D array
# with shape (num_classes, ).
recall = ________________________________________________________
# TODO 6: Get the macro F1 score, which is a scalar value.
macro_f1 = ______________________________________________________
return accuracy, precision, recall, macro_f1
Note:
 An expression is a combination of values, variables, operators, and calls to functions.
 There must be no explicit loops in your expression.
 Your implemented functions should work with any number of test data points and
any number of classes.
 You cannot use any variable that is not defined inside the function or any global
variable.
You may find the following attribute or functions useful for this problem.
 np.max(a, axis = None)
- return the maximum of the array a along the given axis. If axis is None, the result
is a scalar value.
 np.ndarray.sum(axis = None)
- return the sum of the array over the given axis. If axis is None, the result is a
scalar value.
 np.ndarray.diagonal()
- if the array is 2D, then a 1D array containing the diagonal elements is returned.
 np.mean(a, axis)
- return a new array containing the average values of a over the specified axis. If
axis is None, the result is a scalar value.
Write your code in the space below. Answer:
TODO 1: np.max(test_actual) + 1 or np.max(test_predict) + 1
TODO 2: ((test_actual == i) & (test_predict == j)).sum()
TODO 3: confusion_matrix.diagonal().sum() / confusion_matrix.sum()
TODO 4: confusion_matrix.diagonal() / confusion_matrix.sum(axis = 0)
TODO 5: confusion_matrix.diagonal() / confusion_matrix.sum(axis = 1)
TODO 6: np.mean((2 * precision * recall) / (precision + recall))
Scheme:
 1 point for each TODO. 6 points in total.', ARRAY['Evaluation and Validation']::TEXT[], 'Evaluation and Validation', 'Evaluation and Validation', ARRAY['Evaluation and Validation']::TEXT[], ARRAY['metric_computation', 'experimental_design', 'reasoning']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [16 points] Naïve Bayes Classifier
Based on the given training data in the table below, which includes both numerical and
categorical attributes, make a prediction about the degree classification (DC) (i.e., First-Class
Honors or Second-Class Honors D1 or Second-Class Honors D2, or Third-Class Honors) of
the student with the following attribute values using Naïve Bayes classifier.
 Study Attitude (SA): Serious
 Part-time Job (PTJ): No
 Average Energy Level (AEL): 7.2
 Courses Taught by Desmond and Pearl (CDP): Yes
 Number of Friends in the Study Group (NFSG): 4
Student
Study
Attitude
(SA)
Categorical
Part-time
Job
(PTJ)
Categorical
Average
Energy Level
(AEL)
Numerical
Courses
Taught by
Desmond
and Pearl
(CDP)
Categorical
Number of
Friends in the
Study Group
(NFSG)
Categorical
Degree
Classification
(DC)
Serious
No
8.5
Yes
First
Moderate
Yes
6.2
No
Second D1
Serious
No
7.8
Yes
Second D1
Moderate
Yes
5.9
No
Third
Serious
No
8.1
Yes
First
Casual
Yes
6.5
Yes
Second D2
Serious
No
7.3
No
Second D1
Serious
No
7.9
Yes
First
Moderate
Yes
5.7
No
Third
Serious
No
8.2
Yes
First
Casual
Yes
6.1
Yes
Second D2
Serious
No
7.6
No
Second D2
Moderate
Yes
6.4
Yes
Third
Serious
No
8.0
Yes
First
Casual
Yes
5.8
No
Second D2
Serious
No
7.5
Yes
Second D1
Serious
No
7.7
Yes
First
Moderate
Yes
6.0
No
Third
Serious
No
7.8
Yes
First
Casual
Yes
6.3
Yes
Second D2
Assume each categorical attribute has the following possible values:
 Study Attitude: Serious, Moderate, Casual
 Part-time Job: Yes, No
 Courses Taught by Desmond and Pearl: Yes, No
 Number of Friends in the Study Group: 1, 2, 3, 4, 5, 6
Assume that the numerical data follow a Gaussian distribution:
f(x) = 1 / (σ √(2π)) · exp(−(x − µ)² / (2σ²))
where
numerical training data = (x1, x2, . . . , xn)
µ = (1/n) Σ_{i=1}^{n} xi,
σ = √( (1/(n − 1)) Σ_{i=1}^{n} (xi − µ)² )
If needed, apply 1-Laplace Smoothing to the likelihood probabilities of the affected feature
only. The affected feature means that the categorical feature has a category given some
belief in the test dataset, which was NOT observed in the training dataset. Please provide
all the steps.
You may find the following equation useful for this question:
B_NB = argmax_{Bi} P(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) · · · P(ed|Bi)).
(a)
[4 points] Calculate the mean (µ) and standard deviation (σ) of the Average Energy
Level (AEL) of Degree Classification: First-Class Honors and Second-Class Honors D1.
(b)
[3 points] Calculate the test data sample’s likelihoods of Average Energy Level (AEL)
of First-Class Honors and Second-Class Honors D1.
(c) [4 points] Calculate the test data sample’s likelihoods of Study Attitude (SA), Part-time
Job (PTJ), Courses Taught by Desmond and Pearl (CDP), and Number of Friends in
the Study Group (NFSG) of all Degree Classification(s) (DC).
(d)
[2 points] Calculate the prior probabilities.
(e)
[3 points] Finally, calculate the posterior probabilities and make the prediction.
Assume that the likelihood of Average Energy Level (AEL) of Second-Class Honors D2
and Third-Class Honors are:
 P(AEL=7.2|Second-Class Honors D2) = 0.32515
 P(AEL=7.2|Third-Class Honors) = 0.000334158', 16, 14, NULL::jsonb, NULL, NULL, 'Problem 4 [16 points] Naïve Bayes Classifier
Based on the given training data in the table below, which includes both numerical and
categorical attributes, make a prediction about the degree classification (DC) (i.e., First-Class
Honors or Second-Class Honors D1 or Second-Class Honors D2, or Third-Class Honors) of
the student with the following attribute values using Naïve Bayes classifier.
 Study Attitude (SA): Serious
 Part-time Job (PTJ): No
 Average Energy Level (AEL): 7.2
 Courses Taught by Desmond and Pearl (CDP): Yes
 Number of Friends in the Study Group (NFSG): 4
Student
Study
Attitude
(SA)
Categorical
Part-time
Job
(PTJ)
Categorical
Average
Energy Level
(AEL)
Numerical
Courses
Taught by
Desmond
and Pearl
(CDP)
Categorical
Number of
Friends in the
Study Group
(NFSG)
Categorical
Degree
Classification
(DC)
Serious
No
8.5
Yes
First
Moderate
Yes
6.2
No
Second D1
Serious
No
7.8
Yes
Second D1
Moderate
Yes
5.9
No
Third
Serious
No
8.1
Yes
First
Casual
Yes
6.5
Yes
Second D2
Serious
No
7.3
No
Second D1
Serious
No
7.9
Yes
First
Moderate
Yes
5.7
No
Third
Serious
No
8.2
Yes
First
Casual
Yes
6.1
Yes
Second D2
Serious
No
7.6
No
Second D2
Moderate
Yes
6.4
Yes
Third
Serious
No
8.0
Yes
First
Casual
Yes
5.8
No
Second D2
Serious
No
7.5
Yes
Second D1
Serious
No
7.7
Yes
First
Moderate
Yes
6.0
No
Third
Serious
No
7.8
Yes
First
Casual
Yes
6.3
Yes
Second D2
Assume each categorical attribute has the following possible values:
 Study Attitude: Serious, Moderate, Casual
 Part-time Job: Yes, No
 Courses Taught by Desmond and Pearl: Yes, No
 Number of Friends in the Study Group: 1, 2, 3, 4, 5, 6
Assume that the numerical data follow a Gaussian distribution:
$f(x) = \frac{1}{\sigma\sqrt{2\pi}} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right)$
where
numerical training data = (x1, x2, . . . , xn)
$\mu = \frac{1}{n} \sum_{i=1}^{n} x_i, \qquad \sigma = \sqrt{\frac{1}{n-1} \sum_{i=1}^{n} (x_i - \mu)^2}$
If needed, apply 1-Laplace Smoothing to the likelihood probabilities of the affected feature
only. The affected feature means that the categorical feature has a category given some
belief in the test dataset, which was NOT observed in the training dataset. Please provide
all the steps.
You may find the following equation useful for this question:
BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) · · · P(ed|Bi))
.(a)
[4 points] Calculate the mean (µ) and standard deviation (σ) of the Average Energy
Level (AEL) of Degree Classification: First-Class Honors and Second-Class Honors D1.
Answer:
 Mean of average energy level given first-class honors = 8.02857
 Standard deviation of average energy level given first-class honors = 0.26904
 Mean of average energy level given second-class honors, D1 = 7.2
 Standard deviation of average energy level given second-class honors, D1 = 0.69761
Scheme:
 1 point for each correct mean value. 2 points in total.
 1 point for each correct standard deviation value. 2 points in totals.
(b)
[3 points] Calculate the test data sample’s likelihoods of Average Energy Level (AEL)
of First-Class Honors and Second-Class Honors D1.
Answer:
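$P(\mathrm{AEL} = 7.2 \mid \mathrm{First}) = \frac{1}{0.26904\sqrt{2\pi}} \exp\left(-\frac{(7.2 - 8.02857)^2}{2(0.26904)^2}\right) = 0.01293$
$P(\mathrm{AEL} = 7.2 \mid \mathrm{SecondD1}) = \frac{1}{0.69761\sqrt{2\pi}} \exp\left(-\frac{(7.2 - 7.2)^2}{2(0.69761)^2}\right) = 0.57187$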
Scheme:
 1.5 points for each correct test data sample’s likelihood. 3 points in total.
(c) [4 points] Calculate the test data sample’s likelihoods of Study Attitude (SA), Part-time
Job (PTJ), Courses Taught by Desmond and Pearl (CDP), and Number of Friends in
the Study Group (NFSG) of all Degree Classification(s) (DC).
Answer:
 As P(SA = Serious|Third) = 0, we need to apply a add-one-trick for the study
attitude. We assume that three values (Serious, Moderate, Casual) are equally prob-
able:
– P(SA = Serious|First) = $\frac{7+1}{7+3}$ = 0.8
– P(SA = Serious|SecondD1) = $\frac{3+1}{4+3}$ = 0.57
– P(SA = Serious|SecondD2) = $\frac{1+1}{5+3}$ = 0.25
– P(SA = Serious|Third) = $\frac{0+1}{4+3}$ = 0.1429
 As P(PTJ = No|Third) = 0, we need to apply a add-one-trick for the Part-time
Job. We assume that two values (Yes, No) are equally probable:
– P(PTJ = No|First) = $\frac{7+1}{7+2}$ = 0.89
– P(PTJ = No|SecondD1) = $\frac{3+1}{4+2}$ = 0.67
– P(PTJ = No|SecondD2) = $\frac{1+1}{5+2}$ = 0.286
– P(PTJ = No|Third) = $\frac{0+1}{4+2}$ = 0.167
 P(CDP = Yes|First) = $\frac{7}{7}$ = 1
 P(CDP = Yes|SecondD1) = $\frac{2}{4}$ = 0.5
 P(CDP = Yes|SecondD2) = $\frac{3}{5}$ = 0.6
 P(CDP = Yes|Third) = $\frac{1}{4}$ = 0.25
 As P(NFSG = 4|First) = 0, we need to apply a add-one-trick for the Number
of friends in the Study Group. We assume that six values (1,2,3,4,5,6) are equally
probable:
– P(NFSG = 4|First) = $\frac{0+1}{7+6}$ = 0.0769
– P(NFSG = 4|SecondD1) = $\frac{3+1}{4+6}$ = 0.4
– P(NFSG = 4|SecondD2) = $\frac{1+1}{5+6}$ = 0.18
– P(NFSG = 4|Third) = $\frac{0+1}{4+6}$ = 0.1
Scheme:
 0.25 point for each correct test data sample’s likelihood. 4 points in total.
(d)
[2 points] Calculate the prior probabilities.
Answer:
 P(First) = $\frac{7}{20}$ = 0.35
 P(SecondD1) = $\frac{4}{20}$ = 0.2
 P(SecondD2) = $\frac{5}{20}$ = 0.25
 P(Third) = $\frac{4}{20}$ = 0.2
Scheme:
 0.5 point for each correct prior probability. 2 points in total.
(e)
[3 points] Finally, calculate the posterior probabilities and make the prediction.
Assume that the likelihood of Average Energy Level (AEL) of Second-Class Honors D2
and Third-Class Honors are:
 P(AEL=7.2|Second-Class Honors D2) = 0.32515
 P(AEL=7.2|Third-Class Honors) = 0.000334158
Answer:
 P(First|E) = $\frac{(0.35)(0.8)(0.89)(0.01293)(1)(0.0769)}{P(E)} = \frac{0.0002478}{P(E)}$
 P(SecondD1|E) = $\frac{(0.2)(0.57)(0.67)(0.57187)(0.5)(0.4)}{P(E)} = \frac{0.0087359}{P(E)}$
 P(SecondD2|E) = $\frac{(0.25)(0.25)(0.286)(0.32515)(0.6)(0.18)}{P(E)} = \frac{0.0006277}{P(E)}$
 P(Third|E) = $\frac{(0.2)(0.1429)(0.167)(0.000334158)(0.25)(0.1)}{P(E)} = \frac{3.9872 \times 10^{-8}}{P(E)}$
Therefore, the Naïve Bayes classifier predicts “Degree Classification” = Second-Class
Honors D1 for the student.
Scheme:
 0.5 point for each correct posterior probability. 2 points in total.
 1 point for making the correct prediction.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 Part(b) Continued
(c)
[2 points] Based on the chosen K and those 6 training samples in part (b), calculate the
test error for the test dataset below.
Note:
 When selecting a neighbor, to resolve ties, choose the neighbor with the lowest index.
 When evaluating the error, use
– Number of wrong predictions / Number of test data points
.
Attribute 1 | Attribute 2 | Class
3.4 | 3.0 | B
3.0 | 2.8 | A
2.0 | 3.5 | A
2.5 | 7.0 | B', 18, 19, NULL::jsonb, NULL, NULL, 'Problem 5 Part(b) Continued
(c)
[2 points] Based on the chosen K and those 6 training samples in part (b), calculate the
test error for the test dataset below.
Note:
 When selecting a neighbor, to resolve ties, choose the neighbor with the lowest index.
 When evaluating the error, use
– Number of wrong predictions / Number of test data points
.
Attribute 1 | Attribute 2 | Class
3.4 | 3.0 | B
3.0 | 2.8 | A
2.0 | 3.5 | A
2.5 | 7.0 | B
Answer:
Using K=3, test error is 0.
Details:
**Test Data**
([3.4, 3.0], B) k nearest neighbors, [[3.2, 2.9], B, [2.7, 3.0], A, [3.4, 3.8], B] predict label B error: 0
([3.0, 2.8], A) k nearest neighbors, [[3.2, 2.9], B, [2.7, 3.0], A, [2.6, 2.0], A] predict label A error: 0
([2.0, 3.5], A) k nearest neighbors, [[2.5, 3.7], A, [2.7, 3.0], A, [3.4, 3.8], B] predict label A error: 0
([2.5, 7.0], B) k nearest neighbors, [[2.5, 3.7], A, [3.5, 4.0], B, [3.4, 3.8], B] predict label B error: 0
total error for test data: 0/4
Scheme:
 2 points for giving the total error for 6-cross validation for K = 3.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [16 points] Leader Clustering
Consider the following cluster method called Leader Clustering. It receives two parameters:
an integer K and a real-value threshold T. Similar to K-means clustering, it starts by selecting
K instance (which will be called leaders) and assigns each training instance to the cluster of
the closest leader. During this assignment step, however, if the distance of a training instance
to its closest leader is greater than the input threshold T, then this training instance forms a
new cluster and becomes the initial leader of this new cluster. After all the training instances
have been assigned to a cluster, the leader of each cluster is updated as the mean of the
cluster. The process is then repeated until the cluster assignments do not change.
(a)
[6 points] Given a 1-dimensional data set { 1, 3, 5, 9, 11, 13, 15 }, use the Leader
Clustering algorithm and Euclidean distance to cluster the given points in the data set
into 2 clusters. Assume c1 = 3 and c2 = 11 are chosen as the initial K = 2 leaders
and the threshold for forming new clusters T = 5. Fill in the following table of the first
assignment iteration with your completed values and what are the leaders after the first
assignment iteration? (If new clusters are formed in the process, named their leaders as
c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s)
are not formed yet.)
Data
point
Distance
between the
data point and c1
Distance
between the
data point and c2
Distance
between the
data point and c3
(if needed)
Distance
between the
data point and c4
(if needed)
Closest
Centroid
The leaders after the first assignment iteration:
(b) [6 points] Given a 1-dimensional data set {5, 9, 11, 13, 17, 19}, use the Leader Clustering
algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters.
Assume c1 = 5 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for
forming new clusters T = 5. Fill in the following table of the first assignment iteration
with your computed values and what are the leaders after the first assignment iteration?
(If new clusters are formed in the process, named their leaders as c3, c4, ... based on the
order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.)
Data
point
Distance
between the
data point and c1
Distance
between the
data point and c2
Distance
between the
data point and c3
(if needed)
Distance
between the
data point and c4
(if needed)
Closest
Centroid
The leaders after the first assignment iteration:
(c)
[2 points] Which of the two methods, K-Means Clustering or Leader Clustering, will be
better at dealing with outliers? Please briefly explain.
(d)
[2 points] During lectures, we have learned that one drawback of the K-Means Cluster-
ing algorithm is that we need to specify K for the algorithm, but usually, we don’t know
how many clusters there should be for an unlabeled dataset. Will the Leader Clustering
mitigate this drawback? Will there be any related limitations of the Leader Clustering
algorithm? Please briefly explain.', 16, 21, NULL::jsonb, NULL, NULL, 'Problem 6 [16 points] Leader Clustering
Consider the following cluster method called Leader Clustering. It receives two parameters:
an integer K and a real-value threshold T. Similar to K-means clustering, it starts by selecting
K instance (which will be called leaders) and assigns each training instance to the cluster of
the closest leader. During this assignment step, however, if the distance of a training instance
to its closest leader is greater than the input threshold T, then this training instance forms a
new cluster and becomes the initial leader of this new cluster. After all the training instances
have been assigned to a cluster, the leader of each cluster is updated as the mean of the
cluster. The process is then repeated until the cluster assignments do not change.
(a)
[6 points] Given a 1-dimensional data set { 1, 3, 5, 9, 11, 13, 15 }, use the Leader
Clustering algorithm and Euclidean distance to cluster the given points in the data set
into 2 clusters. Assume c1 = 3 and c2 = 11 are chosen as the initial K = 2 leaders
and the threshold for forming new clusters T = 5. Fill in the following table of the first
assignment iteration with your completed values and what are the leaders after the first
assignment iteration? (If new clusters are formed in the process, named their leaders as
c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s)
are not formed yet.)
Data
point
Distance
between the
data point and c1
Distance
between the
data point and c2
Distance
between the
data point and c3
(if needed)
Distance
between the
data point and c4
(if needed)
Closest
Centroid
The leaders after the first assignment iteration:
Answer:
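Data point | Distance between the data point and c1 | Distance between the data point and c2 | Distance between the data point and c3 (if needed) | Distance between the data point and c4 (if needed) | Closest Centroid
1 | 2 | 10 | | | c1
3 | 0 | 8 | | | c1
5 | 2 | 6 | | | c1
9 | 6 | 2 | | | c2
11 | 8 | 0 | | | c2
13 | 10 | 2 | | | c2
15 | 12 | 4 | | | c2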
After the first assignment iteration, c1 = (1+3+5)/3 = 3, c2 = (9+11+13+15)/4 = 12.
Scheme:
 0.25 point for each correct numeric value (or label). 5.25 points in total.
 0.25 point for the correct values of the leaders.
If the values of both leader are correct, 0.75. If only one leader is correct,
0.25.
(b) [6 points] Given a 1-dimensional data set {5, 9, 11, 13, 17, 19}, use the Leader Clustering
algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters.
Assume c1 = 5 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for
forming new clusters T = 5. Fill in the following table of the first assignment iteration
with your computed values and what are the leaders after the first assignment iteration?
(If new clusters are formed in the process, named their leaders as c3, c4, ... based on the
order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.)
Data
point
Distance
between the
data point and c1
Distance
between the
data point and c2
Distance
between the
data point and c3
(if needed)
Distance
between the
data point and c4
(if needed)
Closest
Centroid
The leaders after the first assignment iteration:
Answer:
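Data point | Distance between the data point and c1 | Distance between the data point and c2 | Distance between the data point and c3 (if needed) | Distance between the data point and c4 (if needed) | Closest Centroid
5 | 0 | 6 | | | c1
9 | 4 | 2 | | | c2
11 | 6 | 0 | | | c2
13 | 8 | 2 | | | c2
17 | 12 | 6 | 0 | | c3
19 | 14 | 8 | 2 | | c3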
After the first assignment iteration, c1 = 5, c2 = (9+11+13)/3 = 11, c3 = (17+19)/2 =
18.
Scheme:
 0.25 point for each correct numeric value (or label). 5 points in total.
 1 point for the correct values of the leaders. 1 point in total.
If there are any incorrect leaders in the answer, 0 point
(c)
[2 points] Which of the two methods, K-Means Clustering or Leader Clustering, will be
better at dealing with outliers? Please briefly explain.
Answer:
Leader clustering is more robust (better at dealing with) outliers. This is because new
cluster will be generated and outliers will be assigned to the new cluster without influ-
encing the other clusters.
Scheme:
 1 point for stating which method is better at dealing with outliers.
 1 point for the explanation.
(d)
[2 points] During lectures, we have learned that one drawback of the K-Means Cluster-
ing algorithm is that we need to specify K for the algorithm, but usually, we don’t know
how many clusters there should be for an unlabeled dataset. Will the Leader Clustering
mitigate this drawback? Will there be any related limitations of the Leader Clustering
algorithm? Please briefly explain.
Answer:
The Leader Clustering can mitigate this drawback of specifying K because the algorithm
may get increment on the number of clusters during training. But now the limitation is
that we have to specify T for the Leader Clustering algorithm. The threshold value will
influence whether the algorithm increases new leader (new cluster) or not. The clustering
results will be sensitive to T.
Scheme:
 1 point for stating whether the Leader Clustering mitigate the drawback.
 1 point for the explanation.
Explanations including:
1. More computation/complexity
2. Sensitive to the initial choice of leaders
3. Cannot control the number of clusters
will not get any point.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-midterm', '7', NULL, 7, 'long_question', 'coding', 'Problem 7 Continued:
-------------------- END OF PAPER
--------------------
/* Rough work */
/* Rough work */
/* Rough work */
/* Rough work */', 10, 24, NULL::jsonb, NULL, NULL, 'Problem 7 Continued:
Answer:
D = 2, average accuracy = 0%: In this case, the dataset was divided into two folds. Since
the samples from each class were grouped together, one fold contained samples from classes
0 and 1, and the other fold contained samples from classes 2 and 3. As a result, the classifier
failed to correctly classify any of the samples because it was not trained on the classes present
in the test fold.
D = 3, average accuracy = 50%: In this case, the dataset was divided into three folds.
Fold 1 contained samples from classes 0 and 1, fold 2 contained samples from classes 1 and 2,
and fold 3 contained samples from classes 2 and 3. Using fold 1 and fold 2 as training, and
fold 3 as testing, only 25% of test points are classified correctly.
 Training: Fold 1, 2, Testing: Fold 3, Accuracy = 25%
 Training: Fold 1, 3, Testing: Fold 2, Accuracy = 100%
 Training: Fold 2, 3, Testing: Fold 1, Accuracy = 25%
So, the average accuracy is (25% + 100% + 25%)/3 = 50%.
D = 5, accuracy = 100%: In this case, the dataset was divided into five folds.
Fold 1
contained samples from class 0, fold 2 contained samples from class 0 and 1, fold 3 contained
samples from class 1 and 2, fold 4 contained samples from class 2 and 3, and fold 5 contained
samples from class 3.
 Training: Fold 1, 2, 3, 4, Testing: Fold 5, Accuracy: 100%
 Training: Fold 1, 2, 3, 5, Testing: Fold 4, Accuracy: 100%
 Training: Fold 1, 2, 4, 5, Testing: Fold 3, Accuracy: 100%
 Training: Fold 1, 3, 4, 5, Testing: Fold 2, Accuracy: 100%
 Training: Fold 2, 3, 4, 5, Testing: Fold 1, Accuracy: 100%
So, the average accuracy is (100% + 100% + 100% + 100% + 100%)/5 = 100%.
To improve the implementation of D-fold cross-validation, the dataset should be shuffled
before dividing it into folds. This will ensure that each fold contains a representative dis-
tribution of samples from all classes, reducing bias and providing more reliable evaluation
results.
Scheme:
 2 points for explaining the result obtained for D = 2.
 3 points for explaining the result obtained for D = 3.
 3 points for explaining the result obtained for D = 5.
 2 points for suggesting improvement(s) to the implementation.
-------------------- END OF PAPER
--------------------
/* Rough work */
/* Rough work */
/* Rough work */
/* Rough work */', ARRAY['Evaluation and Validation']::TEXT[], 'Evaluation and Validation', 'Evaluation and Validation', ARRAY['Evaluation and Validation']::TEXT[], ARRAY['metric_computation', 'experimental_design', 'reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table in the answer book. You get 1 point for each correct answer.
(a) The console output of the following Python code is [1,2,3,4,5,6,7].
import numpy as np
# arange(start, stop): Values are generated within half-open interval [start,stop).
array = np.arange(1,10)
print(array[:-2])
(b) There is no way for the Na¨ıve Bayes classifier to make a prediction if a categorical feature
(e.g., color) has a new category (e.g., blue) not observed in the training data set.
(c) K-Nearest Neighbors classifier is a non-parametric machine learning algorithm with an
assumption that the data are uniformly distributed.
(d) It is possible for K-Means Clustering to return empty clusters if certain initial centroid
positions are unfortunate.
(e) If there are only two classes to predict, the following Multi-layer Perceptron (MLP)
models will have the same output, given that they have the same initial weights and
biases, and are trained in the same manner:
 MLP 1: Input layer, hidden dense layer with ReLU activation function, output layer
with sigmoid activation function.
 MLP 2: Input layer, hidden dense layer with ReLU activation function, output layer
with softmax activation function.
(f) An affine transformation may preserve distances and angles.
(g) The number of floating point multiplications involved when a 32×32 pixel RGB image
is passed through a 2D convolutional layer with 8 3×3 kernels, padding of 3, and stride
length of 1 is 8×3×3×36×36.
(h) Setting the dropout rate of a Convolutional Neural Network to 0.5 means that more than
50% of its layer’s outputs are non-zero.
(i) Alpha-beta pruning can sometimes change the final decision made by the minimax algo-
rithm, resulting in a different move being selected for the current player.
(j) Researches that involves human participants should require informed consent.', 10, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions
Indicate whether the following statements are true or false by putting T or F in the given
table. You get 1 point for each correct answer.
Question
(a)
(b)
(c)
(d)
(e)
(f)
(g)
(h)
(i)
(j)
Answer
T
F
F
T
F
T
F
F
F
T
Scheme:
 1 point for giving each correct answer. 10 points in total.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [10 points] Advanced Python: Image Processing with NumPy
You are working on an image processing project that involves manipulating arrays to perform
various operations on images. You have been provided with a NumPy array img representing
a grayscale image of size 512 × 512:
import numpy as np
img = np.random.randint(0, 256, size=(512, 512), dtype=np.uint8)
# Note: random.randint(low, high=None, size=None, dtype=int)
# returns random integers from low (inclusive) to high (exclusive).
(a)
[4 points] Image masking:
Implement the following function which creates a new array img_masked by applying a
mask to img. The mask is defined as a 2D array mask of size 512 × 512, where
mask[i, j] = 1 if the pixel at img[i, j] is within a circle (boundary included) of radius 100
centered at a given position [center_x, center_y], and mask[i, j] = 0 otherwise.
def apply_circle_mask(img, center_x, center_y):
    # --- YOUR CODE HERE ---
    return img_masked
Below is an example for center_x = 100 and center_y = 200.
img
img_masked
Do NOT use any explicit loop to implement the function. You may find the following
functions useful for this question.
 np.arange(start, stop, step) returns spaced values within a given interval.
– start (optional) is the start of interval. The default start value is 0.
– stop is the end of interval. The returned interval does not include this value.
– step (optional) is the spacing between values. The default spacing is 1.
 np.square(a) returns the element-wise square of an array.
– a is the input array.
This is equivalent to a ** 2.
 np.sqrt(a) returns the element-wise square root of an array.
– a is the input array.
 np.expand_dims(a, axis) returns a new array with a new axis of size 1 inserted.
– a is the input array.
– axis is the position where the axis is to be inserted.
If axis is 0, this is equivalent to a[np.newaxis] and a[None]. If axis is 1, this is
equivalent to a[:, np.newaxis] and a[:, None].
(b)
[6 points] Image blurring:
Your task is to use NumPy to apply the following 3 × 3 blur filter to img with zero
padding so that img_blur has the same shape as img.
blur_filter = np.array([[1/9, 1/9, 1/9],
                        [1/9, 1/9, 1/9],
                        [1/9, 1/9, 1/9]])
Implement the following function so that after the whole code snippet below is executed,
img_blur stores the desired result.
def img_flatten_conv_1d(img, v):
    # --- YOUR CODE HERE ---
    return img_conv
v = blur_filter.sum(0) # ''sum(0)'' returns the sum of the array elements over axis 0.
img_blur = img_flatten_conv_1d(img, v)
img_blur = img_flatten_conv_1d(img_blur.T, v).T
Do NOT use any explicit loop in your code. You may find the following functions useful
for this question.
 np.convolve(a, v, mode=''full'') returns the discrete, linear convolution of two
one-dimensional sequences.
– a of shape (N, ): First one-dimensional input array (or array-like structure, e.g.,
list).
– v of shape (M, ): Second one-dimensional input array (or array-like structure,
e.g., list).
– mode (optional): The convolution mode. It must be one of ''full'', ''valid'',
''same''. Note: use ''valid'' for this question. When ''valid'' is used, np.convolve
is equivalent to the following function:
def convolve_valid(a, v):
    if len(a) < len(v):
        a, v = v, a  # swap the array if v is longer than a
    c = np.zeros(len(a) - len(v) + 1)
    for i in range(len(c)):
        c[i] = np.sum(a[i:i+len(v)] * v[::-1])
    return c
Examples:
>>> np.convolve([1,2,3],[0,1,0.5], ''valid'')
array([2.5])  # this is the output array
>>> np.convolve([1,2,3,4],[0,1,0.5], ''valid'')
array([2.5, 3.5])  # this is the output array
 np.zeros(shape, dtype=float) returns a new array of given shape and type, filled
with zeros.
– shape is the shape of the new array, e.g., (2, 3) or 2.
– dtype (optional) is the desired data type for the array, e.g., np.int8. The default
is np.float64.
 np.concatenate((a1, a2, ...), axis=0) joins a sequence of arrays along an ex-
isting axis.
– a1, a2, ... is a sequence of arrays (or array-like structure, e.g., list). The
arrays must have the same shape, except in the dimension corresponding to
axis (the first, by default).
– axis (optional) is the axis along which the arrays will be joined. If axis is None,
arrays are flattened before use. The default is 0.
 np.reshape(a, newshape) gives a new shape to an array without changing its data.
– a is the array to be reshaped.
– newshape is the new shape. It should be compatible with the original shape. If
an integer, then the result will be a 1D array of that length. One shape dimension
can be −1. In this case, the value is inferred from the length of the array and
the remaining dimensions (if any).
This is equivalent to a.reshape(newshape).
 np.transpose(a) returns the transpose of an array.
– a is the input array.
This is equivalent to a.T.
Hints:
(1) In this case, applying blur filter to the image can also be done by consecutively
applying two 1D filters, one vertically and the other horizontally, to the image.
(2) Since np.convolve only accepts 1D arrays, you may consider flattening the image
array, applying np.convolve to the flattened array, and then reshaping it back to a
2D array.
(3) Be aware of the boundaries since np.convolve with mode=''valid'' does not pad
the array and the output array does not always have the same shape as the input
array. Also, remember to remove the padding (if any) after convolutions.', 10, 3, NULL::jsonb, NULL, NULL, 'Problem 2 [10 points] Advanced Python: Image Processing with NumPy
Solution:
(a)
def apply_circle_mask(img, center_x, center_y):
    indices = np.arange(512)
    x_dist = indices - center_x
    y_dist = indices - center_y
    dist = np.sqrt(x_dist ** 2 + y_dist[:, None] ** 2)
    img_masked = img * (dist <= 100)
    return img_masked
(b)
def img_flatten_conv_1d(img, v):
    zeros = np.zeros((512, 1))
    img_padded = np.concatenate((zeros, img, zeros), axis=1)
    img_flat = img_padded.reshape(-1)
    img_flat_conv = np.convolve(img_flat, v, ''valid'')
    img_flat_conv = np.concatenate(([0], img_flat_conv, [0]))
    img_conv = img_flat_conv.reshape(512, 514)[:, 1:-1]
    return img_conv
v = [1/3, 1/3, 1/3]
img_blur = img_flatten_conv_1d(img, v)
img_blur = img_flatten_conv_1d(img_blur.T, v).T
Scheme:
(a) If no explicit loop is used, 1 point for each of the following:
 correct broadcasting to create 2D distance array;
 correct distance values;
 correct mask (0.5 point for the opposite mask);
 correct final result.
If explicit loops are used, 1 point in total if the final result is correct.
(b) If no explicit loop is used, 1 point for each of the following:
 correct padding before flattening the image array (0.5 point if padding is applied
after flattening);
 correct flattening of the image array;
 correct dimensions of inputs (1D arrays) to np.convolve;
 correct values of inputs to np.convolve;
 correct padding after np.convolve;
 correct reshaping back to 2D image and removing padding (0.5 each).
If explicit loops are used, 1 point in total if the final result is correct.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [12 points] Na¨ıve Bayes, K-Nearest Neighbors and Perceptron
Below is some health data of patients with and without dengue fever.
Patient #
Diagnosis (B)
Body Temperature
(Celsius) (e1)
Pulse Rate
(bpm) (e2)
Dengue
Dengue
Dengue
No dengue
No dengue
36.5
No dengue
You can assume the data follows Gaussian distribution:
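$f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right)$
where
$\mu = \frac{1}{n} \sum_{i=1}^{n} x_i, \qquad \sigma = \sqrt{\frac{1}{n-1} \sum_{i=1}^{n} (x_i - \mu)^2}$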
(a)
[6 points] Using the data above, calculate the following. Round off calculations to the
fourth decimal place.
(i) P(e1 = 36 | B = No dengue)
(ii) P(e2 = 85 | B = No dengue)
(iii) P(e1 = 36 | B = No dengue) P(e2 = 85 | B = No dengue) P(B = No dengue)
(iv) Calculate the posterior probability that the belief is ‘No dengue’ for the sample with
evidence E = {e1 = 36, e2 = 85} if
P((e1 = 36, e2 = 85)|B = Dengue)P(B = Dengue) = 0.0000036
(b)
[4.5 points] Identify the reasons for inaccurate predictions when using the following
number of folds to evaluate the performance of a 3-Nearest Neighbors classifier model on
the given dataset (i.e., not including the sample introduced in part (a)) without shuffling.
(i) No. of folds = 6
(ii) No. of folds = 3
(iii) No. of folds = 2
(c)
[1.5 points] If the true label of the sample {e1 = 36, e2 = 85} is ‘No dengue’, will the
perceptron model make a good prediction for the sample? Provide explanations with
evidence for why or why not.', 12, 6, NULL::jsonb, NULL, NULL, 'Problem 3 [12 points] Na¨ıve Bayes, K-Nearest Neighbors and Perceptron
Solution:
(a)
(i) $\frac{1}{\sqrt{2\pi(0.5)^2}} \exp\left(-\frac{(36 - 36.5)^2}{2(0.5^2)}\right) = 0.4839$
(ii) $\frac{1}{\sqrt{2\pi(15)^2}} \exp\left(-\frac{(85 - 90)^2}{2(15^2)}\right) = 0.0252$
(iii) $(0.4839)(0.0252)(0.5) = 0.0061$
(iv) $\frac{0.0061}{0.0061 + 0.0000036} = 0.9994$
(b)
(i) This will predict 5/6 correctly if uniform distance (prediction for Patient 5 is wrong).
Also, if inverse distance is used, the prediction for Patient 3 will also be wrong.
(ii) It is possible that the data will be folded in such a way that the class opposite to
the true label is typically the majority class e.g.
(Patient 1, Patient 2), (Patient 3,
Patient 4), (Patient 5, Patient 6) . In this case, most folds will have 50% accuracy.
The performance may be even worse as there is one pair of samples from opposite
classes with minimum pairwise Euclidean distance (Patient 3 and Patient 5).
(iii) This is not acceptable because it is possible that the training set for the KNN model
will have the class opposite to the true label as its majority class. In this case, the
prediction for all samples will be wrong.
(c) Yes, because in that case the data is linearly separable.
Scheme:
(a)
(i) 1.5 points for giving the correct answer.
(ii) 1.5 points for giving the correct answer.
(iii) 1.5 points for giving the correct answer.
(iv) 1.5 points for giving the correct answer.
(b)
(i) 1.5 point for giving the correct reason.
(ii) 1.5 point for giving the correct reason.
(iii) 1.5 point for giving the correct reason.
(c) 0.5 point for stating “Yes”, i.e., perceptron model will make a good prediction for the
sample. 1 point for giving the correct explanation.', ARRAY['Probabilistic Models', 'KNN and Clustering', 'Perceptron and MLP']::TEXT[], 'Probabilistic Models', NULL, ARRAY['Probabilistic Models', 'KNN and Clustering', 'Perceptron and MLP']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision', 'distance_calculation', 'algorithm_tracing', 'forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [11 points] Multi-layer Perceptron
(a)
[5 points] Amanda wants to train a multi-layer perceptron to classify various renting
options’ popularity.
Each housing option is represented with X, a three-dimensional
vector (x1, x2, x3).
x1 is the noise level rated from 1 to 5. x2 is the proximity to campus, measured in minutes
it takes to arrive at the north gate. x3 is the availability of food options rated on a scale
from 1 (no food available) to 5 (a wide variety of food available). Amanda has collected
the opinions of a group of students on their preferences, classifying the housing options
into three types: low, medium, and high, represented as one-hot vector Y = (y1, y2, y3).
When designing the network architecture, Amanda has two ideas.
Model A
Layer (type)       Output Shape
dense 1 (Dense)    (None, 6)
dense 2 (Dense)    (None, 4)
dense 3 (Dense)    (None, 3)
Model B
Layer (type)       Output Shape
dense 1 (Dense)    (None, 3)
dense 2 (Dense)    (None, 5)
dense 3 (Dense)    (None, 2)
dense 4 (Dense)    (None, 3)
(i) In a multi-layer perceptron, suppose there are L hidden layers, each layer k (starting
from 1) has lk hidden nodes. The input data is a n-dimensional vector, and the
output data is m-dimensional.
(I) Calculate the number of updated parameters in the MLP model. Represent your
result using n, m, L, and lk for k = 1, . . . , L.
(II) Apply your result in part (a)(i)(I) to Model A and Model B separately.
(ii) Please help Amanda decide on which multi-layer perceptron (A or B) to choose for
potentially better performance. Please briefly explain why.
(b) [1 point] Why do we use activation functions in multi-layer perceptron neural networks?
(c)
[2 points] Consider the two activation functions we have learned in class:
 Sigmoid activation function: σ(x) =
1 / (1 + e^(−x)).
 Binary step activation function:
f(x) = 0 if x ≤ 0, and 1 otherwise.
The binary step activation function gives a hard threshold at 0, and its gradients are 0
almost everywhere. What is the problem if we use the binary step activation function in
a multi-layer perceptron network? If we want to avoid the problem while approximating
the binary step activation function, how to make use of the sigmoid activation function
to achieve that?
(d)
[3 points] You are given an multi-layer perceptron model with the architecture shown
below.
 ReLU: f(z) = max(0, z).
 Softmax: f(zi) =
e^(zi) / Σ_{j=0}^{n−1} e^(zj), where z = [z0, z1, . . . , zn−1].
(i) For a sample with features x1=1 and x2=1, what are the outputs of the hidden layer
and the output layer? If necessary, round off the values to two decimal places.
(ii) If the target labels have values Tk1=1,Tk2=0 for the sample in part d(i), calculate the
new values of the weights: w5, w7, and w1 after one round of backward propagation
if the learning rate is 0.4. Round off the values to four decimal places.
For reference, here are some of the equations used in the back propagation.
δk = (Ok −Tk)Ok(1 −Ok)
δj = Oj(1 −Oj) Σ_{k∈K} δk wjk
wjk ←wjk −ηδkOj
wij ←wij −ηδjOi', 11, 7, NULL::jsonb, NULL, NULL, 'Problem 4 [11 points] Multi-layer Perceptron
Solution:
(a)
(i) (I) For any hidden layer and the output layer, the parameters include the weight
and the bias.
(n + 1) × l_1 + Σ_{k=1}^{L−1} l_{k+1} × (l_k + 1) + m × (l_L + 1)
(II) Model A: 67; Model B: 53.
(ii) Model A is better because it has more parameters and therefore more expressive.
(b) To add non-linearity to the neural network model so that it has more powerful modeling
capability.
(c) The problem is that we cannot learn the parameters using gradient descent since the
gradients are 0 almost everywhere. We can solve the problem while approximating a
hard threshold by scaling up the weights in a sigmoid activation function. For example,
σ(cx) is steeper than σ(x) and more similar with the binary step function, for c > 1.
(d)
(i) Hidden layer Oj1 = 0.2, Oj2 = 0
Output layer Ok1 = 0.48, Ok2 = 0.52
(ii) w5′ = w5 −δk1ηOj1 = 0.1 −(−0.1298)(0.4)(0.2) = 0.1104
w7′ = w7 −δk2ηOj1 = 0 −(0.1298)(0.4)(0.2) = −0.0104
δj1 = Oj1(1−Oj1)(δk1w5+δk2w7) = 0.2(1−0.2)(−0.1298(0.1)+0.1298(0)) = −0.0021
w1′ = w1 −δj1ηx1 = 0.2 −(0.4)(−0.0021)(1) = 0.2008
Scheme:
(a)
(i) (I) 1.5 points for giving the correct formula.
(II) 1 point for each correct answer. 2 points in total.
(ii) 0.5 point for stating Model A is better. 1 point for giving the correct explanation.
1.5 points in total.
(b) 1 point for giving the correct answer for why we use activation functions in multi-layer
perceptron.
(c) 1 point for stating the problem. 1 point for explaining how to make use of the sigmoid
function to avoid the problem. 2 points in total.
(d)
(i) 0.5 point for giving each correct output. 1.5 points in total.
(ii) 0.5 point for giving each correct weight value. 1.5 points in total.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [13 points] Digital Image Processing
(a) [4.5 points] The following is a grayscale image of the HKUST redbird and its correspond-
ing histogram. An grayscale image histogram is a distribution showing the frequency of
occurrence of each gray-level value.
After some transformation on the original image, we get the transformed images shown
below (first row). Their histograms are shuffled and shown in the rows below (second,
third, and fourth row). Please state which transformations are applied to get the resulting
images for (a),(b),(c), and what is the correct pairing for the images and histograms
between {a,b,c} and {d,e,f}. Please also briefly state why.
(b)
[3 points] Consider the following 3 × 3 image.
Perform binary thresholding of the
image using Otsu’s method. The initial threshold is T=100, and we apply one iteration.
What is the resulting threshold and the resulting image after thresholding? What is the
advantage of using Otsu’s Method for image thresholding compared to the regular image
thresholding algorithm?
(c)
[2 points] What is the resulting image of size 7 × 7 after adding reflection padding of
size 2 on the original image in part (b)?
(d)
[1.5 points] Please briefly explain the effect of convolving an image with the following
kernels.
(i) Kernel 1:
1/9 1/9 1/9
1/9 1/9 1/9
1/9 1/9 1/9
(ii) Kernel 2:
-1
-1
-1
(iii) Kernel 3:
-1
-1
-1
-1
-1
-1
-1
-1
(e)
[2 points] Is it possible to design a 3 × 3 kernel and apply convolution with the kernel
to flip a 64×64 image horizontally or vertically? If yes, please give such a kernel. If not,
please explain why.', 13, 9, NULL::jsonb, NULL, NULL, 'Problem 5 [13 points] Digital Image Processing
Solution:
(a) (b)-(f): image (horizontal) flipping because the histogram is the same as the original
image.
(c)-(d): binary thresholding because there are only two values 0 and 255 in the histogram
(a)-(e): contrast stretching because the intensity value in the histogram has been stretched
to a wider range.
(b) After the first iteration µ1=21 , µ2=128, T = 74.5 The resulting images are
Compared to regular image thresholding algorithms, Otsu’s method has advantages (i)
can automatically determine the threshold value T (ii) the resulting threshold value is
reproducible. Given the same image, two researchers using Otsu’s algorithm must arrive
at the same threshold.
(c) Resulting image:
(d) (i) smoothing, (ii) vertical edge detection, (iii) sharpening.
(e) No, because the image flipping is a global operation on the image, but convolution with
a 3 × 3 kernel is a local operation. Concretely, the 3 × 3 kernels can only capture the
input value of 3 × 3 neighbors, but the flipping requires the pixel value information at a
longer distance. The longest dependency distance can be 64.
Scheme:
(a) 1 for stating each transformation correctly. 3 points in total. 0.5 point for giving each
correct pairing. 1.5 points in total.
(b) 1 point for giving the correct resulting threshold. 1 point for giving the correct result
image. 1 point for stating the advantage of using Otsu’s method. 3 points in total.
(c) 0.05 for giving each correct value (40 values). 2 points in total.
(d) 0.5 point for stating each effect of convolving with the given kernel correctly. 1.5 points
in total.
(e) 0.5 point for stating it is impossible to design a 3×3 kernel and apply it to flip the 64×64
image. 1.5 points for giving the explanation.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [13 points] Dilated Convolution and Dropout
(a)
[10 points] Dilated convolution is a variation of the standard convolution operation that
involves skipping input elements by a certain dilation rate. By doing this, the convo-
lutional kernel can be made to “see” a larger area of the input data without actually
increasing the number of parameters it has. The dilation rate determines the spacing
between the values in the kernel.
The figure below demonstrates how a 3×3 kernel is applied to a 7×7 image using a
dilation factor of 2.
Given the incomplete implementation of the Python function dilated convolution that
takes the following inputs:
 input array: a 2D NumPy array representing the input data
 kernel: a 2D NumPy array representing the convolutional kernel
 dilation rate: an integer representing the dilation rate (default value is 1)
 stride: an integer representing the stride (default value is 1)
 padding: a string representing the padding type, either ‘valid’ (no padding) or ‘same’
(padding to preserve input dimensions) (default value is ‘valid’)
and returns a 2D NumPy array representing the output of the dilated convolution oper-
ation.
Complete the missing parts of the function using NumPy, without using any special-
ized deep learning libraries so that the execution of the test script produces the required
output. Make sure that your implementation supports stride and padding options.
You may find the following formula for determining the size of output image of reg-
ular image convolution useful for this question.
(Size of image dimension - Size of kernel dimension + 2 × Padding) / Stride + 1
import numpy as np
# Zero pad the input_array. For example
# a = [[1,2,3]]
# np.pad(a, ((1,2),(3,4)), ''constant'')
# >> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# >>        [0, 0, 0, 1, 2, 3, 0, 0, 0, 0],
# >>        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# >>        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
def dilated_convolution(input_array, kernel, dilation_rate=1,
                        stride=1, padding=''valid''):
    # Apply padding to the input array if specified
    if padding == ''same'':
        pad = #______ TODO 1 ______
        padded_input = np.pad(input_array, ((pad, pad), (pad, pad)), mode=''constant'')
    else:
        padded_input = input_array
    # Calculate the output shape
    output_rows = #______ TODO 2 ______
    output_cols = #______ TODO 3 ______
    kernel_rows, kernel_cols = kernel.shape
    # Initialize the output array with zeros
    output_array = np.zeros((output_rows, output_cols))
    # Iterate through the kernel and perform the convolution
    for i in range(kernel_rows):
        for j in range(kernel_cols):
            # Calculate the input indices for the current kernel position
            input_row_indices = #______ TODO 4 ______
            input_col_indices = #______ TODO 5 ______
            # Perform the convolution and accumulate the results in the output array
            output_array += #______ TODO 6 ______
    return output_array
# Test script
input_array = np.array(np.arange(100).reshape(10,10))
kernel = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
dilation_rate = 2; stride = 2; padding = ''same''
output_array = dilated_convolution(input_array, kernel,
dilation_rate, stride, padding)
print(output_array)
# Output:
# [[ 22.  26.  30.  34.   8.]
#  [ 62.  66.  72.  78.  34.]
#  [102. 126. 132. 138.  74.]
#  [142. 186. 192. 198. 114.]
#  [ 80. 142. 146. 150. 154.]]
You may find the following functions useful for this question.
 np.arange(start, stop, step) returns spaced values with a given interval.
– start (optional) is the start of interval. The default start value is 0.
– stop is the end of interval. The returned interval does not include this value.
– step (optional) is the spacing between values. The default spacing is 1.
 np.array(object) creates an array.
– object is an array or any sequence.
 np.reshape(a, newshape) gives a new shape to an array without changing its data.
– a is the array to be reshaped.
– newshape is the new shape. It should be compatible with the original shape. If
an integer, then the result will be a 1D array of that length. One shape dimension
can be -1. In this case, the value is inferred from the length of the array and the
remaining dimensions (if any).
This is equivalent to a.reshape(newshape).
 ndarray.shape returns a tuple of array dimensions.
 ndarray.zeros(shape, dtype=float) returns a new array of given shape and type,
filled with zeros.
– shape is the shape of the new array, e.g., (2,3) or 2.
– dtype(optional) is the desired data type for the array, e.g., np.int8. The default
is np.float64.
 range(n) returns a numeric series starting with 0 and extending up to but not
including n.
(b) [3 points] You are given an incomplete implementation of the Python function dropout,
which implements the Dropout technique for regularization in neural networks.
The
function takes the following input:
 input array: a 2D NumPy array representing the input data
 p: a dropout probability
The function generates a binary mask with the same shape as the input, where each
element is 1 with probability 1-p and 0 with probability p. The input is then multiplied
element-wise by the mask, effectively dropping out random elements. The function re-
turns a NumPy array representing the output of the dropout operation.
Complete the missing part of the function using NumPy, without using any special-
ized deep learning libraries so that the execution of the test script produces the required
output.
You may find the following function useful for this question.
 np.random.rand(d0, d1, . . ., dn) returns an array of the given shape and popu-
late it with random samples from a uniform distribution over [0, 1).
– d0, d1,. . ., dn (optional) represent the dimensions of the returned array, must
be non-negative. If no argument is given, a single Python float is returned.
import numpy as np
def dropout(input_array, p):
    mask = #______ TODO ______
    return input_array * mask', 13, 12, NULL::jsonb, NULL, NULL, 'Problem 6 [13 points] Dilated Convolution and Dropout
Solution:
(a) Dilated Convolution
TODO #
Answer
((kernel.shape[0] - 1) * dilation_rate) // 2
1.5 points
(input_array.shape[0] - kernel.shape[0] * dilation_rate + 2 * pad) // stride + 1
1.5 points
(input_array.shape[1] - kernel.shape[1] * dilation_rate + 2 * pad) // stride + 1
1.5 points
np.arange(0, output_rows * stride, stride) + i * dilation_rate
1.5 points
np.arange(0, output_cols * stride, stride) + j * dilation_rate
1.5 points
kernel[i, j] * padded_input[np.ix_(input_row_indices, input_col_indices)]
2.5 points
(b) Dropout
(np.random.rand(input_array.shape[0],input_array.shape[1]) < p) / p
# 3 points', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [18 points] Convolutional Neural Network
(a)
[8 points] Consider the following Keras implementation of a Convolutional Neural Net-
work:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Flatten
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(5, 5), padding=''same'',
activation=''relu'', input_shape=(32, 32, 3)))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'',
activation=''relu''))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'',
activation=''relu'', strides=(2, 2)))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'',
activation=''relu''))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(units=128, activation=''relu''))
model.add(Dense(units=10, activation=''softmax''))
(i) What is the padding size (in terms of border width) of the first convolutional layer?
(ii) Fill in the blanks in the answer book in the model.summary() of the network. Assume
all convolutional layers have biases. Numbers shown as ? are kept secret and you
may ignore them.
Hint: When padding = “same” and strides = 2, the output has the half size of the
input.
(iii) Suppose you observe the following loss curves as you train the Convolutional Neural
Network. What is the most likely problem with the model? When does the problem
usually occur? Describe a common way to mitigate the problem and explain why it
works (in a few sentences).
(b) [5 points] What is the output of the following parts of this very tiny convolutional neural
network? This network was trained to classify whether the input 4×4 image depicts the
letter “L” or not. Stride is 1, padding size is 0 for all layers, and the pool size for the
max pooling layer is (2, 2). Show your calculation step by step. You only need to keep
1 decimal number.
 ReLU activation function f(z) = max(0, x).
 Sigmoid activation function f(z) =
1 / (1 + e^(−z)), where e = 2.718.
Note: Assume the given kernels have already been flipped.
(i) Feature map after Max-Pooling
(ii) Feature in the fully-connected layer
(iii) Output
(c) [5 points] In video processing, 3D convolution is particularly valuable due to its ability to
capture temporal dynamics as well as spatial features, which is essential for understanding
content across frames. Unlike 2D convolution, which only analyzes spatial patterns within
a single frame, 3D convolution extends the analysis across multiple consecutive frames,
treating time as the third dimension alongside height and width. This approach allows a
Convolutional Neural Network (CNN) to not only perceive static features in individual
frames but also to understand movements and changes over time, which are crucial for
tasks such as action recognition, video classification, and anomaly detection in video
streams.
Suppose we have video data of shape (32, 400, 300, 3), corresponding to (#keyframes,
width, height, #channels). In the first layer, we want to apply a 3D convolution with
100 kernels in the shape of (3, 3, 3), corresponding to (keyframe, width, height). The
padding is (0, 2, 2), and the stride is (1, 2, 2). Please answer the following question with
calculation steps or rationales.
(i) What is the shape of the output after this layer?
(ii) How many weight parameters are there?
(iii) How many bias parameters are there?', 18, 16, NULL::jsonb, NULL, NULL, 'Problem 7 [18 points] Convolutional Neural Network
Solution:
(a)
(i) 2
(ii) 16, 16, 64
8,
8,
(iii) The model has overfit the training dataset.
Overfitting occurs when a model is
too complex and learns the noise in the training data rather than the underlying
patterns.
Dropout can be used to mitigate the problem. Dropout is a technique where, during
training, some neurons in the network are randomly dropped out (i.e., their outputs
are set to zero) with a certain probability, typically 0.2 or 0.5. This means that,
at each training iteration, a different subset of neurons is randomly selected to be
“dropped out.”
Dropout helps prevent overfitting in several ways (full marks to any of the fol-
lowing):
 Reducing capacity: By randomly dropping out neurons, the network’s capacity
is reduced, making it less prone to overfitting. With fewer neurons, the network
has fewer opportunities to memorize the training data.
 Forcing feature sharing: Dropout encourages feature sharing among neurons.
When a neuron is dropped out, the network must rely on other neurons to make
predictions, which promotes feature sharing and reduces overfitting.
 Preventing complex co-adaptations: Dropout breaks the complex co-adaptations
between neurons, which can lead to overfitting. By randomly dropping out neu-
rons, the network is forced to learn simpler, more generalizable representations.
 Improving generalization: Dropout can be seen as a form of data augmentation.
By randomly dropping out neurons, the network is forced to generalize to new,
unseen situations, which improves its ability to generalize to new data.
 Reducing the risk of over-reliance on a single neuron: Dropout prevents the
network from relying too heavily on a single neuron or a small group of neurons.
This reduces the risk of overfitting, as the network is forced to use multiple
neurons to make predictions.
 Ensemble-like behavior: Dropout can be seen as an ensemble method, where
multiple sub-networks are trained simultaneously. Each sub-network is a different
subset of neurons, and the final prediction is an ensemble of these sub-networks.
This ensemble-like behavior improves generalization and reduces overfitting.
The answer is not unique.
(b) Answer:
(c)
(i) For any dimension (let’s denote it generically as D): Output D = floor((Input D +
2 × Padding - Kernel D ) / Stride) + 1. Therefore, we have:
Output keyframe = floor((32 + 2 * 0 - 3) / 1) + 1 = 30
Output width = floor((400 + 2 * 2 - 3) / 2) + 1 = 201
Output height = floor((300 + 2 * 2 - 3) / 2) + 1 = 151
The output shape is (30, 201, 151, 100).
(ii) There are 100 kernels in the shape of (3, 3, 3), and each kernel spans the 3 input
channels. Therefore, the number of weight parameters is 100 × 3 × 3 × 3 × 3 = 8100.
(iii) There is 1 bias parameter per kernel. So the total biases is 100.
Scheme:
(a)
(i) 1 point for giving the correct padding size.
(ii) 1 point for giving each correct shape (3 shape values). 1 point for giving the correct
number of parameters. 4 points in total.
(iii) 1 point for stating the model has overfit the training dataset. 1 point for explaining
what does the problem usually occur. 1 point for describing a way to mitigate the
problem. 3 points in total.
(b) 1 point for giving each correct feature map (2 feature maps). 1 point for giving each
feature in the fully-connected layer (2 features). 1 point for giving the correct output. 5
points in total.
(c)
(i) 2 points for giving the correct output shape after this layer.
(ii) 1.5 points for giving the correct number of weight parameters.
(iii) 1.5 points for giving the correct total number of biases.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'hard', '', '', ''),
    ('COMP2211-2024-spring-final', '8', NULL, 8, 'long_question', 'long_answer', 'Problem 8 [10 points] Minimax and Alpha-Beta Pruning
(a)
[5 points] The figure below shows a tree in a minimax game with two players.
A
B
C
D
E
F
G
H
I
J
MIN
MAX
Terminal
nodes
Score
(i) Calculate the best score for the non-terminal nodes with the minimax algorithm.
(ii) Suppose we are using an alpha-beta pruning algorithm. Indicate which edge will be
pruned. Gives the alpha-beta values when running. We denote the edge between
nodes A and B as AB.
(iii) How do you make a minimax algorithm to find the shortest path to victory? Explain
in one sentence.
(b) [5 points] Consider the non zero-sum generalization in which the sum of the two players’
utilities are not necessarily zero. Because player A’s utility no longer determines player
B’s utility exactly, the leaf utilities are written as pairs (UA, UB), with the first and second
component indicating the utility of that leaf to player A and player B respectively. In
this generalized setting, player A seeks to maximize UA, the first component, while player
B seeks to maximize UB, the second component.
(i) Complete the table in the answer book by estimating the value (as pairs) at each of
the internal node. Assume that each player maximizes their own utility.
(ii) Is alpha-beta pruning still applicable in this case? Briefly explain why and provide
an example. (Hint: you can think about the case where UA(s) = UB(s) for all nodes.)', 10, 18, NULL::jsonb, NULL, NULL, 'Problem 8 [10 points] Minimax and Alpha-Beta Pruning
Solution:
(a)
(i) Answer:
Nodes
Score
A
B
C
D
(ii) Answer:
Edge
Alpha
Beta
CH
DJ
(iii) Record depth information to distinguish paths.
(b)
(i) Answer:
A
(2,4)
B
(0,3)
C
(-1,3)
D
(1,1)
E
(0,-2)
(ii) No. The values that the first and second player are trying to maximize are inde-
pendent. Therefore, the principle for pruning in alpha-beta pruning—that a worse
outcome for one player implies a better outcome for the other—no longer applies.
For instance, in the case where UA(s) = UB(s) for all nodes, the problem reduces to
searching for the max-valued leaf, which could appear anywhere in the tree.
Scheme:
(a)
(i) 0.25 point for giving each correct numeric value. 1 point in total.
(ii) 0.5 point for giving each correct edge/numeric value. 3 points in total.
(iii) 1 point for giving a way to find the shortest path to victory.
(b)
(i) 0.5 point for giving each pair of values. 2.5 points in total.
(ii) 0.5 point for stating “No”. 1 point for explaining why and 1 point for giving an
example. 2.5 points in total.', ARRAY['Search and Games']::TEXT[], 'Search and Games', 'Search and Games', ARRAY['Search and Games']::TEXT[], ARRAY['tree_search', 'pruning', 'manual_tracing']::TEXT[], 'medium', '', '', ''),
    ('COMP2211-2024-spring-final', '9', NULL, 9, 'long_question', 'short_answer', 'Problem 9 [3 points] Ethics of Artificial Intelligence
From what areas we should ensure the AI models in production in their organizations func-
tion ethically?', 3, 19, NULL::jsonb, NULL, NULL, 'Problem 9 [3 points] Ethics of Artificial Intelligence
Solution:
 Data Ethics
 Fair AI model (or avoiding AI model bias)
 AI model monitoring and maintenance.
Scheme:
 1 point for giving each area correctly. 3 points in total.
', ARRAY['Ethics of AI']::TEXT[], 'Ethics of AI', 'Ethics of AI', ARRAY['Ethics of AI']::TEXT[], ARRAY['concept_explanation', 'argumentation', 'comparison']::TEXT[], 'easy', '', '', '')
) AS seed(
  source_exam_key,
  question_number,
  parent_question,
  display_order,
  question_type,
  question_format,
  question_text,
  score,
  page_number,
  options,
  correct_option,
  correct_answer,
  raw_answer_text,
  topics,
  topic_primary,
  analytics_topic,
  topic_tags,
  skill_tags,
  difficulty,
  knowledge_reminder,
  ai_hint,
  solution
)
-- Resolve each seed row to its parent paper through source_exam_key, limited
-- to papers whose source_kind is course_library. This is an inner join, so a
-- seed row with no matching paper produces no insert.
JOIN papers AS p
  ON p.source_exam_key = seed.source_exam_key
 AND p.source_kind = 'course_library'
-- Idempotency guard: skip a seed row when paper_questions already holds a row
-- with the same (paper_id, question_number). Existing rows are left untouched;
-- this statement never updates them.
WHERE NOT EXISTS (
  SELECT 1 FROM paper_questions q
  WHERE q.paper_id = p.id
    AND q.question_number = seed.question_number
);