-- ============================================ -- PastPaper Master — COMP2211 problem-level questions -- Seed Date: 2026-03-24 -- ============================================ -- -- Preconditions: -- 1. Run comp2211_course_library_papers.sql first. -- 2. Ensure those paper rows already exist in `papers`. -- -- This seed inserts one row per top-level Problem. -- It is intentionally coarse-grained and idempotent. INSERT INTO paper_questions ( paper_id, question_number, parent_question, display_order, question_type, question_format, question_text, score, page_number, options, correct_option, correct_answer, raw_answer_text, topics, topic_primary, analytics_topic, topic_tags, skill_tags, difficulty, knowledge_reminder, ai_hint, solution ) SELECT p.id, seed.question_number, seed.parent_question, seed.display_order, seed.question_type, seed.question_format, seed.question_text, seed.score, seed.page_number, seed.options, seed.correct_option, seed.correct_answer, seed.raw_answer_text, seed.topics, seed.topic_primary, seed.analytics_topic, seed.topic_tags, seed.skill_tags, seed.difficulty, seed.knowledge_reminder, seed.ai_hint, seed.solution FROM ( VALUES ('COMP2211-2022-fall-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [15 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1.5 points for each correct answer. (a) Machine learning gives computers the ability to make decision by writing down rules and methods and being explicitly programmed. (b) The regression technique in machine learning is NOT a group of algorithms that are used for predicting a class/category. (c) A 5-fold cross validation for K-nearest neighbors algorithm means that for each value of K, we randomly select 1/5 of the training data as the validation set to evaluate the model which is trained by the remaining (4/5) of the training data. (d) If we use K-means clustering, we will get the same cluster assignments for each data point, whether or not we standardize the variables. (e) Given an input data point x, where all the attribute values in x are real numbers. Suppose you are asked to predict a label y for x, where y = 0 or y = 1. Assume you have no knowledge about the distributions of x and y, perceptron is an appropriate method for this problem. (f) A perceptron with the unit step function (i.e., f(z) = 0 if z ≤0, otherwise f(z) = 1) as the activation function cannot be used for multi-class classification. (g) If a training dataset is linearly separable into two classes, the perceptron learning rule will always converge to weights and bias that accomplish the desired classification. (h) The neural network weights are updated during forward propagation. (i) An advantage of gradient descent-based methods, such as back-propagation, is that they cannot get stuck in local minima. (j) The back-propagation algorithm, when run until a minimum is achieved, always finds the same solution (i.e., weights and biases) no matter what the initial set of weights and biases are. Question (a) (b) (c) (d) (e) (f) (g) (h) (i) (j) Answer', 15, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [15 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1.5 points for each correct answer. (a) Machine learning gives computers the ability to make decision by writing down rules and methods and being explicitly programmed. (b) The regression technique in machine learning is NOT a group of algorithms that are used for predicting a class/category. (c) A 5-fold cross validation for K-nearest neighbors algorithm means that for each value of K, we randomly select 1/5 of the training data as the validation set to evaluate the model which is trained by the remaining (4/5) of the training data. (d) If we use K-means clustering, we will get the same cluster assignments for each data point, whether or not we standardize the variables. (e) Given an input data point x, where all the attribute values in x are real numbers. Suppose you are asked to predict a label y for x, where y = 0 or y = 1. Assume you have no knowledge about the distributions of x and y, perceptron is an appropriate method for this problem. (f) A perceptron with the unit step function (i.e., f(z) = 0 if z ≤0, otherwise f(z) = 1) as the activation function cannot be used for multi-class classification. (g) If a training dataset is linearly separable into two classes, the perceptron learning rule will always converge to weights and bias that accomplish the desired classification. (h) The neural network weights are updated during forward propagation. (i) An advantage of gradient descent-based methods, such as back-propagation, is that they cannot get stuck in local minima. (j) The back-propagation algorithm, when run until a minimum is achieved, always finds the same solution (i.e., weights and biases) no matter what the initial set of weights and biases are. Question (a) (b) (c) (d) (e) (f) (g) (h) (i) (j) Answer F T F F F T T F F F Marking scheme: ˆ 1.5 points for each correct answer. 15 points in total', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-fall-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [19 points] Python Fundamentals (a) [7 points] Consider the following NumPy arrays: import numpy as np A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) B = np.array([[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]) Write the output for each of the following Python statements. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) print(A[::3]) (ii) print(A[:-2:-2]) (iii) print(B[:, 1, 3:0:-1]) (iv) print(B[0, 1, [0,3]]) The NumPy array B is repeated here to ease your reading. B = np.array([[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]) (v) print(B[ B % 4 == 0 ]) (vi) # numpy.sum(a, axis) # returns the sum of array elements over a given axis print( np.sum( B, axis=1 ) ) (vii) # numpy.ndarray.reshape(shape) # returns an array containing the same data with a new shape print( B.reshape( (3, -1, 2) ) ) (b) [6 points] Write the output for the following Python code segments. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) import numpy as np A = np.array([[1, 2, 3], [2, 4, 6]]) B = np.array([1, 2, 3]) print(A * B) (ii) import numpy as np A = np.array([1, 2, 3, 4]) B = np.array([[1, 2], [2, 4], [3, 6], [4, 8]]) print(A + B) (iii) import numpy as np A = np.array([1, 2, 3, 4]) B = np.array([[2], [3], [4]]) print(A + B) (c) [6 points] The cosine similarity of two non-zero vectors, xTrain = (xTrain , . . . , xTrain n ) and xTest = (xTest , . . . , xTest n ), can be calculated by the following formula: Pn i=1 (xTrain i × xTest i ) qPn i=1 (xTrain i )2 qPn i=1 (xTest i )2 For example, if a training sample xTrain is (0, 1, 2) and a testing sample xTest is (4, 6, 8), the cosine similarity is: 0 × 4 + 1 × 6 + 2 × 8 √ 02 + 12 + 22√ 42 + 62 + 82 = 0.91350028 Given the following NumPy arrays, X_train and X_test, where each 1D array represents a data point: import numpy as np X_train = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]]) X_test = np.array([[4, 6, 8], [5, 0, 0]]) Compute the cosine similarity scores between each data point in X_train and each data point in X_test with a one-line Python expression, such that the evaluated result of the expression is: [[0.91350028 0. ] [1. 0.37139068] [0.99461155 0.45584231]] N ote: ˆ An expression is a combination of values, variables, operators, and calls to functions. ˆ Your expression should work with any number of data points in X_train and X_test and any number of values in the data points. ˆ You can assume that the number of attribute values in each data point is the same for both X_train and X_test. ˆ There must be no explicit loops in your expression. You may find the following attribute or functions useful for this question. ˆ Dot product of two arrays, a and b: numpy.dot(a, b) – It returns the product of matrix multiplication. ˆ Return the element-wise square of an array, x numpy.square(x) ˆ Return the sum of array elements over a given axis. numpy.sum(a, axis=None) – a: the input array with elements to sum. – axis: None or int or tuple of ints ˆ Return the non-negative square-root of an array, element-wise. numpy.sqrt(x) – x: the input array with values whose square-roots are required. ˆ Insert a new axis that will appear at the axis position in the expanded array shape. numpy.expand_dims(a, axis) – a: the input array. – axis: an int or tuple of ints that represents position in the expanded axes where the new axis (or axes) is placed. ˆ The transposed array. numpy.ndarray.T Write the one-line Python expression below: print( )', 19, 3, NULL::jsonb, NULL, NULL, 'Problem 2 [19 points] Python Fundamentals (a) [7 points] Consider the following NumPy arrays: Marking scheme: ˆ 1 point each sub-questions, i.e. (i) to (vii) ˆ No partial score import numpy as np A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) B = np.array([[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]) Write the output for each of the following Python statements. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) print(A[::3]) Answer: [1 4 7] (ii) print(A[:-2:-2]) Answer: [9] (iii) print(B[:, 1, 3:0:-1]) Answer: [[ 7 5] [19 18 17]] (iv) print(B[0, 1, [0,3]]) Answer: [4 7] (v) print(B[ B % 4 == 0 ]) Answer: [ 0 8 12 16 20] (vi) # numpy.sum(a, axis) # returns the sum of array elements over a given axis print( np.sum( B, axis=1 ) ) Answer: [[12 15 18 21] [48 51 54 57]] (vii) # numpy.ndarray.reshape(shape) # returns an array containing the same data with a new shape print( B.reshape( (3, -1, 2) ) ) Answer: [[[ 0 1] [ 2 3] [ 4 5] [ 6 7]] [[ 8 9] [10 11] [12 13] [14 15]] [[16 17] [18 19] [20 21] [22 23]]] (b) [6 points] Write the output for the following Python code segments. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. Marking scheme: ˆ 2 points each sub-questions, i.e. (i) to (iii) ˆ No partial score (i) import numpy as np A = np.array([[1, 2, 3], [2, 4, 6]]) B = np.array([1, 2, 3]) print(A * B) Answer: [[ 1 9] [ 2 8 18]] (ii) import numpy as np A = np.array([1, 2, 3, 4]) B = np.array([[1, 2], [2, 4], [3, 6], [4, 8]]) print(A + B) Answer: Error (iii) import numpy as np A = np.array([1, 2, 3, 4]) B = np.array([[2], [3], [4]]) print(A + B) Answer: [[3 4 5 6] [4 5 6 7] [5 6 7 8]] (c) [6 points] The cosine similarity of two non-zero vectors, xTrain = (xTrain , . . . , xTrain n ) and xTest = (xTest , . . . , xTest n ), can be calculated by the following formula: Pn i=1 (xTrain i × xTest i ) qPn i=1 (xTrain i )2 qPn i=1 (xTest i )2 For example, if a training sample xTrain is (0, 1, 2) and a testing sample xTest is (4, 6, 8), the cosine similarity is: 0 × 4 + 1 × 6 + 2 × 8 √ 02 + 12 + 22√ 42 + 62 + 82 = 0.91350028 Given the following NumPy arrays, X_train and X_test, where each 1D array represents a data point: import numpy as np X_train = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 6]]) X_test = np.array([[4, 6, 8], [5, 0, 0]]) Compute the cosine similarity scores between each data point in X_train and each data point in X_test with a one-line Python expression, such that the evaluated result of the expression is: [[0.91350028 0. ] [1. 0.37139068] [0.99461155 0.45584231]] N ote: ˆ An expression is a combination of values, variables, operators, and calls to functions. ˆ Your expression should work with any number of data points in X_train and X_test and any number of values in the data points. ˆ You can assume that the number of attribute values in each data point is the same for both X_train and X_test. ˆ There must be no explicit loops in your expression. You may find the following attribute or functions useful for this question. ˆ Dot product of two arrays, a and b: numpy.dot(a, b) – It returns the product of matrix multiplication. ˆ Return the element-wise square of an array, x numpy.square(x) ˆ Return the sum of array elements over a given axis. numpy.sum(a, axis=None) – a: the input array with elements to sum. – axis: None or int or tuple of ints ˆ Return the non-negative square-root of an array, element-wise. numpy.sqrt(x) – x: the input array with values whose square-roots are required. ˆ Insert a new axis that will appear at the axis position in the expanded array shape. numpy.expand_dims(a, axis) – a: the input array. – axis: an int or tuple of ints that represents position in the expanded axes where the new axis (or axes) is placed. ˆ The transposed array. numpy.ndarray.T Write the one-line Python expression below: print( ) Answer: print(X_train.dot(X_test.T) / np.dot(np.expand_dims(np.sqrt(np.sum(X_train**2, axis=1)), 1), np.expand_dims(np.sqrt(np.sum(X_test**2, axis=1)), 1).T) ) print(X_train.dot(X_test.T) / np.dot(np.expand_dims(np.sqrt(np.sum(X_train**2, axis=1)), 1), np.expand_dims(np.sqrt(np.sum(X_test**2, axis=1)), 0)) ) print(np.dot(X_train, X_test.transpose()) / (np.sqrt(np.sum(np.square(X_train),axis=1)[:, None]) * np.sqrt(np.sum(np.square(X_test), axis=1)[None, :])) ) print(np.dot(X_train, np.transpose(X_test)) / (np.sqrt(np.sum(np.square(X_train),axis=1)[:, np.newaxis]) * np.sqrt(np.sum(np.square(X_test), axis=1)[np.newaxis,:])) ) print(np.dot(X_train, X_test.T) / np.sqrt(np.sum(np.square(X_train), axis=1)).reshape(3,1).dot( np.sqrt(np.sum(np.square(X_test), axis=1)).reshape(1, 2)) ) Marking scheme: ˆ 2 points for correct numerator – If not using np.dot correctly, deduct 1 point (np.dot can be replaced by np.matmul, @); – If not using np.ndarray.T correctly, deduct 1 point (np.ndarray.T can be replaced by np.transpose); – The maximum number of points can be deducted for numerator is 2. ˆ 4 points for correct denominator – If not using np.expand dims correctly (including specifying axis), deduct 1 point for each (np.expand dims can be replaced by np.reshape or adding a new axis); – If not using np.dot correctly, deduct 1 point (np.dot can be replaced by np.matmul, @); – If not using np.sqrt correctly, deduct 1 point for each; – If not using np.sum correcly (including specifying axis), deduct 1 point for each; – If not using np.square correctly, deduct 1 point for each (np.square can be re- placed by np.ndarray ** 2); – The maximum number of points can be deducted for denominator is 4. If points to be deducted exceed 4, the denominator part gets 0 point.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''), ('COMP2211-2022-fall-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [18 points] Conditional Probability and Bayes Classifier (a) [4 points] Assume the probability of getting an A+ in COMP 2211 is 0.08. The prob- ability of getting a score greater than 90 in the midterm exam given that a student gets an A+ in COMP 2211 is 0.95, and the probability of getting a score greater than 90 in the midterm exam given that the student does not get an A+ in COMP 2211 is 0.05. (i) Calculate the probability of getting a score greater than 90 in the midterm exam. Show all your steps. Round your answer to 2 decimal places. The formula that you may find useful for this question: P(E) = P(E|B)P(B) + P(E|Not B)P(Not B) (ii) Use Bayes’ rule to calculate the probability of getting an A+ given that a student gets a score greater than 90 in the midterm exam. Show all your steps. Round your answer to 2 decimal places. The formula that you may find useful for this question: P(B|E) = P(B)P(E|B) P(E) (b) [8 points] Given the following dataset (X, y) with 10 training examples, each contains 2 attributes (x1, x2) and its binary label y. X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]] y = [0 0 1 1 1 1 0 0 1 1] Suppose you believe that a naive Bayes model would be appropriate for this dataset, and you want to classify the test sample: x = [1 1] (i) Compute the class prior probabilities, i.e., P(y = 0) and P(y = 1). (ii) Compute the 4 conditional probabilities required by naive Bayes for the test sample, i.e., P(x1 = 1|y = 0), P(x2 = 1|y = 0), P(x1 = 1|y = 1), P(x2 = 1|y = 1). (iii) Under the naive Bayes model and the probabilities you compute in parts b(i) and b(ii), what is the most likely label for the test sample, i.e., 0 or 1? If there is a tie, please include the word “tie” in your answer. Show all your steps. The formula that you may find useful for this question: BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) . . . P(ed|Bi)) (c) [3 points] Briefly describe a zero frequency problem of Naive Bayes classification and suggest a way to solve the problem. Also, state whether a zero frequency problem occurs in part (b). The dataset (X, y) is repeated here to ease your reading. X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]] y = [0 0 1 1 1 1 0 0 1 1] (d) [3 points] Consider the dataset given in part (b), i.e., the one above. Suppose you do NOT believe that the naive Bayes model would be appropriate for this dataset, and you want to classify the test sample: x = [0 0] using the Bayes model without making the naive assumption. What is the most likely label for the test sample? If there is a tie, please include the word “tie” in your answer. Show all your steps. The formula that you may find useful for this question: BNB = argmaxBiP(Bi)P((e1, e2, e3, . . . , ed)|Bi)', 18, 8, NULL::jsonb, NULL, NULL, 'Problem 3 [18 points] Conditional Probability and Bayes Classifier (a) [4 points] Assume the probability of getting an A+ in COMP 2211 is 0.08. The prob- ability of getting a score greater than 90 in the midterm exam given that a student gets an A+ in COMP 2211 is 0.95, and the probability of getting a score greater than 90 in the midterm exam given that the student does not get an A+ in COMP 2211 is 0.05. (i) [3 points] Calculate the probability of getting a score greater than 90 in the midterm exam. Show all your steps. Round your answer to 2 decimal places. The formula that you may find useful for this question: P(E) = P(E|B)P(B) + P(E|Not B)P(Not B) Answer: P(A+) =0.08 P(> 90|A+) =0.95 P(> 90|Not A+) =0.05 P(> 90) =P(> 90|A+)P(A+) + P(> 90|Not A+)P(Not A+) =0.95(0.08) + 0.05(0.92) = 0.12 Marking scheme: ˆ 0.5 point for P(A+) = 0.08 ˆ 0.5 point for P(> 90|A+) = 0.95 ˆ 0.5 point for P(> 90|Not A+) = 0.05 ˆ 0.5 point for P(Not A+) = 0.92 ˆ 1 point for P(> 90) = 0.12 (ii) [1 point] Use Bayes’ rule to calculate the probability of getting an A+ given that a student gets a score greater than 90 in the midterm exam. Show all your steps. Round your answer to 2 decimal places. The formula that you may find useful for this question: P(B|E) = P(B)P(E|B) P(E) Answer: P(A + | > 90) = P(A+)P(> 90|A+) P(> 90) = 0.08(0.95) 0.12 = 0.63 Marking scheme: ˆ 1 point for P(A + | > 90) = 0.63 (b) [8 points] Given the following dataset (X, y) with 10 training examples, each contains 2 attributes (x1, x2) and its binary label y. X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]] y = [0 0 1 1 1 1 0 0 1 1] Suppose you believe that a naive Bayes model would be appropriate for this dataset, and you want to classify the test sample: x = [1 1] (i) [1 point] Compute the class prior probabilities, i.e., P(y = 0) and P(y = 1). Answer: P(y = 0) = 4 10 = 2 P(y = 1) = 6 10 = 3 Marking scheme: ˆ 0.5 point for P(y = 0) = 2 ˆ 0.5 point for P(y = 1) = 3 (ii) [4 points] Compute the 4 conditional probabilities required by naive Bayes for the test sample, i.e., P(x1 = 1|y = 0), P(x2 = 1|y = 0), P(x1 = 1|y = 1), P(x2 = 1|y = 1). Answer: P(x1 = 1|y = 0) = 0 P(x2 = 1|y = 0) = 3 P(x1 = 1|y = 1) = 5 P(x2 = 1|y = 1) = 3 6 = 1 Marking scheme: ˆ 1 point for P(x1 = 1|y = 0) = 0 ˆ 1 point for P(x2 = 1|y = 0) = 3 ˆ 1 point for P(x1 = 1|y = 1) = 5 ˆ 1 point for P(x2 = 1|y = 1) = 1 (iii) [3 points] Under the naive Bayes model and the probabilities you compute in parts b(i) and b(ii), what is the most likely label for the test sample, i.e., 0 or 1? If there is a tie, please include the word “tie” in your answer. Show all your steps. The formula that you may find useful for this question: BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) . . . P(ed|Bi)) Answer: The numerator part of P(y = 0|x) = P(y = 0)P(x1 = 1|y = 0)P(x2 = 1|y = 0) = 0 The numerator part of P(y = 1|x) = P(y = 1)P(x1 = 1|y = 1)P(x2 = 1|y = 1) = 3 5  1  = 1 So, the most likely label for the test sample is 1. Marking scheme: ˆ 1 point for the numerator part of P(y = 0|x) = 0 ˆ 1 point for the numerator part of P(y = 1|x) = 1 ˆ 1 point for stating the most likely label for the test sample is 1 (c) [3 points] Briefly describe a zero frequency problem of Naive Bayes classification and suggest a way to solve the problem. Also, state whether a zero frequency problem occurs in part (b). Answer: If categorical variable has a category in test data set, which was not observed in the training data set, then the frequency-based probability estimate will be zero. And we will get a zero when all the probabilities are multiplied. This will be unable to make a prediction. This is known as zero frequency. A way to overcome this “zero frequency problem” is to add one to the count for ev- ery attribute value-class combination when an attribute value does not occur with every class value. A zero frequency problem occurs in part (b). Marking scheme: ˆ 1 point for briefly describing a zero frequency problem ˆ 1 point for suggesting a way to solve the problem *** Please note that suggesting “adding more training data/increase the dataset size/Using Log()” will not get any mark *** ˆ 1 point for stating a zero frequency problem occurs in part (b) The dataset (X, y) is repeated here to ease your reading. X = [[0 0], [0 1], [1 1], [1 0], [1 1], [1 0], [0 1], [0 1], [1 1], [0 0]] y = [0 0 1 1 1 1 0 0 1 1] (d) [3 points] Consider the dataset given in part (b), i.e., the one above. Suppose you do NOT believe that the naive Bayes model would be appropriate for this dataset, and you want to classify the test sample: x = [0 0] using the Bayes model without making the naive assumption. What is the most likely label for the test sample? If there is a tie, please include the word “tie” in your answer. Show all your steps. The formula that you may find useful for this question: BNB = argmaxBiP(Bi)P((e1, e2, e3, . . . , ed)|Bi) Answer: The numerator part of P(y = 0|x) = P(y = 0)P((x1 = 0, x2 = 0)|y = 0) = 2  1  = 1 The numerator part of P(y = 1|x) = P(y = 1)P((x1 = 0, x2 = 0)|y = 1) = 3  1  = 1 As the two probabilities are the same value, it’s a tie. Marking scheme: ˆ 1 point for the numerator part of P(y = 0|x) = 1 ˆ 1 point for the numerator part of P(y = 1|x) = 1 ˆ 1 point for stating it’s a tie', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'hard', '', '', ''), ('COMP2211-2022-fall-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors Consider a set of 8 training data given as (xTrain, cTrain) values, where xTrain is a string with 6 bits and cTrain is the multi-class label, A, B or C: { ("101110", A), ("110100", B), ("011100", B), ("000101", B), ("011111", B), ("101101", B), ("100011", A), ("010000", C) } Classify a test sample (xTest) with the string “101011” using a K-Nearest Neighbors classifier (KNN) and Hamming distance. Hamming distance is the number of bit positions in which the two bits are different. For example, the hamming distance between two strings, “11011001” and “10011101” is 2. (a) [5 points] Complete the following table by filling in the computed Hamming distance between each training data point and the test sample. Also, determine the class label of the test sample using KNN with K = 3 based on the results. xTrain cTrain Distance “101110” A “110100” B “011100” B “000101” B “011111” B “101101” B “100011” A “010000” C The predicted class label of the test sample is . (b) [3 points] What will happen if we use a KNN classifier with K = 4 instead to classify the test sample above? Describe your answer. (c) [3 points] A student claims that if a KNN classifier with K = 3 is used with the above training data, the predicted class label will never be class C no matter what the test example is, if there is no tie-breaking strategy used. Is this claim correct or not? Explain your answer. (d) [3 points] Is the above training data suitable for performing a D-fold cross-validation? Explain your answer.', 14, 12, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors Consider a set of 8 training data given as (xTrain, cTrain) values, where xTrain is a string with 6 bits and cTrain is the multi-class label, A, B or C: { ("101110", A), ("110100", B), ("011100", B), ("000101", B), ("011111", B), ("101101", B), ("100011", A), ("010000", C) } Classify a test sample (xTest) with the string “101011” using a K-Nearest Neighbors classifier (KNN) and Hamming distance. Hamming distance is the number of bit positions in which the two bits are different. For example, the hamming distance between two strings, “11011001” and “10011101” is 2. (a) [5 points] Complete the following table by filling in the computed Hamming distance between each training data point and the test sample. Also, determine the class label of the test sample using KNN with K = 3 based on the results. xTrain cTrain Distance “101110” A “110100” B “011100” B “000101” B “011111” B “101101” B “100011” A “010000” C The predicted class label of the test sample is . Answer: xTrain cTrain Distance “101110” A “110100” B “011100” B “000101” B “011111” B “101101” B “100011” A “010000” C The predicted class label of the test sample is A. Marking scheme: ˆ 0.5 point for each distance, i.e. 4 points total ˆ 1 point for the predicted class label (b) [3 points] What will happen if we use a KNN classifier with K = 4 instead to classify the test sample above? Describe your answer. Answer: We will have a tie where 2 nearest neighbors are with class label A and 2 nearest neighbors are with class label B. Marking scheme: ˆ 1 point for mentioning “Tie” or ”cannot find the class label” ˆ 2 points for the description of “2 NNs are class label A and 2 NNs are class label B” (c) [3 points] A student claims that if a KNN classifier with K = 3 is used with the above training data, the predicted class label will never be class C no matter what the test example is, if there is no tie-breaking strategy used. Is this claim correct or not? Explain your answer. Answer: The claim is correct. If k = 3 and assume equal weights for all the points, no mat- ter what the testing example is, it will be one of the following 3 cases: ˆ none of the 3 nearest neighbors is with class label C, so the predicted label will not be C. ˆ 1 of the 3 nearest neighbors is with class label C and the other 2 neighbors are both with class label A, or both with class label B, so the predicted class label will only be A or B. ˆ A tie Marking scheme: ˆ 1 point for “correct” or “yes” ˆ 2 points for the explanation (d) [3 points] Is the above training data suitable for performing a D-fold cross-validation? Explain your answer. Answer: No, unless the number of folds is equal to the number of training data points. The above training data set is umbalanced. Hence, if we split the above training data set, the data used for training may not have samples with class labels A or C. Marking scheme: ˆ 1 point for “no” ˆ 2 points for the explanation', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-fall-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [12 points] K-Means Clustering Consider the following training dataset: Example A B C D E Attribute value 0.2 0.5 0.7 2.5 3.5 Apply the K-means clustering algorithm to this dataset for K=2, i.e., you will produce two data clusters. Assume the distance measure you use is Euclidean distance, defined as follows. distance(xTrain, xTest) = v u u t n X i=1 (xTrain i −xTest i )2 (a) [1.5 points] Assume examples A and B are chosen as the initial centroid of clusters 1 and cluster 2, respectively. Write down the resulting cluster assignments in the table below. The cluster assignment for A and B has been done for you. Example A B C D E Cluster Assignment (b) [2 points] After assigning the examples to the clusters in part (a), re-compute the cluster centroids to be the mean of the examples currently assigned to each cluster. For each cluster, write the new cluster centroid in the table below. Cluster Centroid (c) [2.5 points] After recomputing the cluster centroids in part (b), re-assign the examples to the clusters to which they are closest. Write down the resulting cluster assignments in the table below. Example A B C D E Cluster Assignment (d) [2 points] After assigning the examples to the clusters in part (c), re-compute the cluster centroids to be the mean of the examples currently assigned to each cluster. For each cluster, write the new cluster centroid in the table below. If your answer is a floating point number, round it to 2 decimal places. Cluster Centroid (e) [2.5 points] After recomputing the cluster centroids in part (d), re-assign the examples to the clusters to which they are closest. Write down the resulting cluster assignments in the table below. Example A B C D E Cluster Assignment (f) [1.5 points] State whether the K-means clustering algorithm converges for this dataset after performing all the steps in parts (a)-(e). Explain your answer.', 12, 14, NULL::jsonb, NULL, NULL, 'Problem 5 [12 points] K-Means Clustering Consider the following training dataset: Example A B C D E Attribute value 0.2 0.5 0.7 2.5 3.5 Apply the K-means clustering algorithm to this dataset for K=2, i.e., you will produce two data clusters. Assume the distance measure you use is Euclidean distance, defined as follows. distance(xTrain, xTest) = v u u t n X i=1 (xTrain i −xTest i )2 (a) [1.5 points] Assume examples A and B are chosen as the initial centroid of clusters 1 and cluster 2, respectively. Write down the resulting cluster assignments in the table below. The cluster assignment for A and B has been done for you. Example A B C D E Cluster Assignment Marking scheme: ˆ 0.5 point for each correct cluster assignment. 1.5 points in total (b) [2 points] After assigning the examples to the clusters in part (a), re-compute the cluster centroids to be the mean of the examples currently assigned to each cluster. For each cluster, write the new cluster centroid in the table below. Cluster Centroid 0.2 1.8 Marking scheme: ˆ 0.5 point for each correct centroid. 1 point in total (c) [2.5 points] After recomputing the cluster centroids in part (b), re-assign the examples to the clusters to which they are closest. Write down the resulting cluster assignments in the table below. Example A B C D E Cluster Assignment Marking scheme: ˆ 0.5 point for each correct cluster assignment. 2.5 points in total (d) [2 points] After assigning the examples to the clusters in part (c), re-compute the cluster centroids to be the mean of the examples currently assigned to each cluster. For each cluster, write the new cluster centroid in the table below. If your answer is a floating point number, round it to 2 decimal places. Cluster Centroid 0.47 Marking scheme: ˆ 1 point for each correct centroid. 2 points in total (e) [2.5 points] After recomputing the cluster centroids in part (d), re-assign the examples to the clusters to which they are closest. Write down the resulting cluster assignments in the table below. Example A B C D E Cluster Assignment Marking scheme: ˆ 0.5 point for each correct cluster assignment. 2.5 points in total (f) [1.5 points] State whether the K-means clustering algorithm converges for this dataset after performing all the steps in parts (a)-(e). Explain your answer. Answer: The algorithm converges for this dataset since the cluster assignment of examples re- mains the same. Marking scheme: ˆ 0.5 point for stating the algorithm converges ˆ 1 point for the correct explanation', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-fall-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [14 points] Perceptron Suppose you are a CSE professor at HKUST who wants to offer a new course, COMP 2211. Before starting, you want to predict if COMP 2211 will be successful or not. You approach two students, A and B, asking them to read the proposal of 5 existing courses, COMP 1111, COMP 2222, COMP 3333, COMP 4444, and COMP 5555, and rate them on a scale of 1 to 5 (assume only integer scores). Assume each student reads the proposals independently and announces their rating. As each student might be biased, you may be unable to average their ratings. Instead, you decide to use a perceptron to classify the data using the training dataset in Table 1. Course Code x1 x2 Success COMP 1111 No COMP 2222 Yes COMP 3333 No COMP 4444 Yes COMP 5555 Yes Table 1: Training dataset The training dataset is composed of the two student scores, x1 = score given by student A, and x2 = score given by student B, and the corresponding true label regarding the success of the courses judged based on the SFQ, for the 5 courses. (a) [8 points] Train the perceptron to generate O = 1 if the course is success, O = 0 otherwise, using the initial weight of w1 = 0, w2 = 0, θ = -1, and the activation function: f(z) =    if z ≤0 otherwise Also, use a learning rate of η = 1. You may find the updating rules below useful. ∆wi = η(T −O)xi ∆θ = η(T −O) wi = wi + ∆wi θ = θ + ∆θ Present each row of Table 1 as a training example, and update the perceptron weights and bias before moving on to the next row. Show all the steps by completing Table 2. x1 x2 T O ∆w1 w1 ∆w2 w2 ∆θ θ - - - - - - - -1 Table 2: Perceptron algorithm execution (b) [6 points] Now, assume you do not care about the true label regarding the success of the courses judged based on the SFQ. State whether each of the following scenarios for which a perceptron using the ratings of the two students can correctly classify the data. Justify your answer. (i) If the total of their ratings is more than 8, then the course will be success and otherwise it will fail, i.e., Course Code x1 x2 Success COMP 1111 No COMP 2222 No COMP 3333 Yes COMP 4444 No COMP 5555 No (ii) The course will succeed if and only if each reviewer gives either a rating of 2 or a rating of 3, i.e., Course Code x1 x2 Success COMP 1111 No COMP 2222 Yes COMP 3333 No COMP 4444 No COMP 5555 Yes', 14, 16, NULL::jsonb, NULL, NULL, 'Problem 6 [14 points] Perceptron Suppose you are a CSE professor at HKUST who wants to offer a new course, COMP 2211. Before starting, you want to predict if COMP 2211 will be successful or not. You approach two students, A and B, asking them to read the proposal of 5 existing courses, COMP 1111, COMP 2222, COMP 3333, COMP 4444, and COMP 5555, and rate them on a scale of 1 to 5 (assume only integer scores). Assume each student reads the proposals independently and announces their rating. As each student might be biased, you may be unable to average their ratings. Instead, you decide to use a perceptron to classify the data using the training dataset in Table 1. Course Code x1 x2 Success COMP 1111 No COMP 2222 Yes COMP 3333 No COMP 4444 Yes COMP 5555 Yes Table 1: Training dataset The training dataset is composed of the two student scores, x1 = score given by student A, and x2 = score given by student B, and the corresponding true label regarding the success of the courses judged based on the SFQ, for the 5 courses. (a) [8 points] Train the perceptron to generate O = 1 if the course is success, O = 0 otherwise, using the initial weight of w1 = 0, w2 = 0, θ = -1, and the activation function: f(z) =    if z ≤0 otherwise Also, use a learning rate of η = 1. You may find the updating rules below useful. ∆wi = η(T −O)xi ∆θ = η(T −O) wi = wi + ∆wi θ = θ + ∆θ Present each row of Table 1 as a training example, and update the perceptron weights and bias before moving on to the next row. Show all the steps by completing Table 2. x1 x2 T O ∆w1 w1 ∆w2 w2 ∆θ θ - - - - - - - -1 -1 -4 -1 -5 -3 -1 -1 Table 2: Perceptron algorithm execution Marking scheme: ˆ 0.2 point for each correct value. 40 values, 8 points in total (b) [6 points] Now, assume you do not care about the true label regarding the success of the courses judged based on the SFQ. State whether each of the following scenarios for which a perceptron using the ratings of the two students can correctly classify the data. Justify your answer. (i) [3 points] If the total of their ratings is more than 8, then the course will be success and otherwise it will fail, i.e., Course Code x1 x2 Success COMP 1111 No COMP 2222 No COMP 3333 Yes COMP 4444 No COMP 5555 No Answer: Perceptron can correctly classify the data in this scenario, because the data are linearly separable. Marking scheme: ˆ 1 point for stating the perceptron can correctly classify the data ˆ 2 points for the explanation *** 2 points for the explanation part if correct weights and bias are given. *** (ii) [3 points] The course will succeed if and only if each reviewer gives either a rating of 2 or a rating of 3, i.e., Course Code x1 x2 Success COMP 1111 No COMP 2222 Yes COMP 3333 No COMP 4444 No COMP 5555 Yes Answer: Perceptron cannot correctly classify the data in this scenario, because the data are not linearly separable. Marking scheme: ˆ 1 point for stating the perceptron cannot correctly classify the data ˆ 2 points for the explanation', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-fall-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [8 points] Multilayer Perceptron Given the following training dataset with 4 examples in which each example has 3 binary input attributes, x1, x2, x3 and 1 binary output T. x1 x2 x3 T Suppose we are learning a Multilayer Perceptron with the training dataset above; the neural network has 1 hidden layer of 2 neurons, answer the following questions. (a) [2 points] How many weights and how many biases are there in the neural network? (b) [6 points] Draw the neural network below. -------------------- END OF PAPER -------------------- /* Rough work */ /* Rough work */', 8, 18, NULL::jsonb, NULL, NULL, 'Problem 7 [8 points] Multilayer Perceptron Given the following training dataset with 4 examples in which each example has 3 binary input attributes, x1, x2, x3 and 1 binary output T. x1 x2 x3 T Suppose we are learning a Multilayer Perceptron with the training dataset above; the neural network has 1 hidden layer of 2 neurons, answer the following questions. (a) [2 points] How many weights and how many biases are there in the neural network? Answer: 8 weights and 3 biases Marking scheme: ˆ 1 point for the number of weights ˆ 1 point for the number of biases *** 1 point if students only put 11 as the sum of these variables (e.g., 8 + 3 = 11) and do not indicate 8 for weights and 3 for biases. *** (b) [6 points] Draw the neural network below. Answer: Marking scheme: ˆ 3 points for indicating 3 input nodes, 2 hidden neurons, and 1 output neuron. -0.5 point for losing 1 node/neuron. ˆ 1 point for indicating 8 weights. -0.5 point for losing 1 weight indicator, at most -1. ˆ 1 point for indicating 3 biases. -0.5 point for losing 1 bias indicator, at most -1. ˆ 1 point for lines that connected layers. -9,5 point for losing 1 connecting line, at most -1. *** Additionally, suppose the number of nodes/neurons, the number of weights, the number of biases, and the number of lines are more than expected. In that case, we will only consider those correct ones, and the layer containing additional nodes/neurons will be regarded as wrong. (e.g. if you put 3-¿4-¿1 as network, then only the input nodes and output neuron consider to be correct, while all lines, weights, and biases should be wrong, you can get 2 points.) *** -------------------- END OF PAPER --------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'easy', '', '', ''), ('COMP2211-2022-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [15 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1.5 points for each correct answer. (a) Machine learning is a sub-field of artificial intelligence. (b) Tensorflow is easy to use and less flexible than Keras. (c) Suppose we wish to calculate P(B|e1, e2) and we have no conditional independence in- formation. Having P(e1, e2), P(B), P(e1, e2|B) are sufficient for the calculation. (d) Training a K-nearest neighbors classifier takes more computational time than applying it. (e) K-nearest neighbors cannot be used for regression. (f) Consider D-fold cross-validation. A higher number of folds (i.e. larger value of D), the estimated error will be lower on average. (g) K-means clustering algorithm is guaranteed to converge. (h) Consider a two-class classification problem. Suppose we have trained a perceptron model on a linearly separable training set, and now we get a new labeled data point which is correctly classified by the model, and far away from the decision boundary. If we add this new point to our earlier training set and re-train with the same set of initial weights and bias, the learnt decision boundary will be changed for sure. (i) Gradient descent is usually NOT guaranteed to converge at global minimum. (j) If the learning rate is too small, gradient descent may take a very long time to converge and computationally expensive. Question Answer (T/F) (a) (b) (c) (d) (e) (f) (g) (h) (i) (j)', 15, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [15 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1.5 points for each correct answer. (a) Machine learning is a sub-field of artificial intelligence. (b) Tensorflow is easy to use and less flexible than Keras. (c) Suppose we wish to calculate P(B|e1, e2) and we have no conditional independence in- formation. Having P(e1, e2), P(B), P(e1, e2|B) are sufficient for the calculation. (d) Training a K-nearest neighbors classifier takes more computational time than applying it. (e) K-nearest neighbors cannot be used for regression. (f) Consider D-fold cross-validation. A higher number of folds (i.e. larger value of D), the estimated error will be lower on average. (g) K-means clustering algorithm is guaranteed to converge. (h) Consider a two-class classification problem. Suppose we have trained a perceptron model on a linearly separable training set, and now we get a new labeled data point which is correctly classified by the model, and far away from the decision boundary. If we add this new point to our earlier training set and re-train with the same set of initial weights and bias, the learnt decision boundary will be changed for sure. (i) Gradient descent is usually NOT guaranteed to converge at global minimum. (j) If the learning rate is too small, gradient descent may take a very long time to converge and computationally expensive. Question Answer (T/F) (a) T (b) F (c) T (d) F (e) F (f) T (g) T (h) F (i) T (j) T Marking scheme: ˆ 1.5 points for each correct answer.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [16 points] Python Fundamentals For each of the Python expressions below, write the output when the expression is evaluated. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (a) [5 points] Consider the following NumPy arrays: import numpy as np A = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]) B = np.array([ [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15] ]) (i) print(A[2:6:3]) (ii) print(B[0:2, 1:3]) (iii) print(A[-1:-3]) (iv) print(B[::-1]) (v) print(B[:3,2:]) (b) [2 points] What is the output of the following code? import numpy as np X = np.array([2,2,0,2,2,0,1,1,0,1,1,0]) Y = X[X != 0] print(Y[::2]) (c) [9 points] Given the following code which computes the distance between each training point in X train and each test point in X test using a nested loop over both the training data and test data. import numpy as np def compute_distances_nested_loops(X_train, X_test): num_test = X_test.shape[0] num_train = X_train.shape[0] distances = np.zeros((num_test, num_train)) # --- BLOCK TO REWRITE --- for i in range(num_test): for j in range(num_train): distances[i,j] = np.sqrt(np.sum(np.square(X_test[i,:]-X_train[j,:]))) # --- BLOCK TO REWRITE --- return distances Train = np.array([[0,1], [1,2], [2,3], [3,4]]) Test = np.array([[5,6], [7,8]]) print(compute_distances_nested_loops(Train, Test)) # Output: # [[7.07106781 5.65685425 4.24264069 2.82842712] # [9.89949494 8.48528137 7.07106781 5.65685425]] Rewrite the block of code between the comment lines # --- BLOCK TO REWRITE --- using no explicit loops in the space provided. Hints: ˆ To compute the distance between training data point (0, 1) and test data point (5, 6), we do dist = (0 −5)2 + (1 −6)2 ˆ Expand it dist = 02 −2(0)(5) + 52 + 12 −2(1)(6) + 62 = 02 + 12 + 52 + 62 −2(0)(5) −2(1)(6) = 02 + 12 + 52 + 62 −2((0)(5) + (1)(6)) You may find the following functions useful for this question. ˆ Dot product of two arrays: numpy.dot(a, b) – It returns the product of matrix multiplication. ˆ Return the element-wise square of the input numpy.square(x) – x is the input data ˆ Return the sum of array elements over a given axis. numpy.sum(a, axis=None) – a is an array with elements to sum. – axis: None or int or tuple of ints. axis = 0 means along the column and axis = 1 means working along ther row. ˆ Return the non-negative square-root of an array, element-wise. numpy.sqrt(x) – x is the array with values whose square-roots are required. ˆ Return a matrix (or a 2D array) from an 1D array. numpy.matrix(data) – data is the 1D array. – Example: numpy.matrix([1, 2, 3]), output is [[1, 2, 3]] ˆ Insert a new axis that will appear at the axis position in the expanded array shape. numpy.expand_dims(a,axis) – a is the input array. – axis is an int or tuple of ints that represents poisition in the expanded axes where the new axis (or axes) is placed.', 16, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [16 points] Python Fundamentals For each of the Python expressions below, write the output when the expression is evaluated. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (a) [5 points] Consider the following NumPy arrays: import numpy as np A = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]) B = np.array([ [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15] ]) (i) print(A[2:6:3]) Answer: [30 60] # 1 point (ii) print(B[0:2, 1:3]) Answer: [[1 2] [5 6]] # 1 point (iii) print(A[-1:-3]) Answer: Empty Array # 1 point (iv) print(B[::-1]) Answer: [[12 13 14 15] # 1 point [ 8 9 10 11] [ 4 7] [ 0 3]] (v) print(B[:3,2:]) Answer: [[ 2 3] # 1 point [ 6 7] [10 11]] Remark: ˆ 0.5 point will be deducted if the answers are in raw number format without [](i.e. 1 2 5 6). Because in this case, the answer is no longer in array type, and we can- not distinguish whether the answer is a 2d array or 1d array. This deduction only performs once for a(i) and a(ii). (b) [2 points] What is the output of the following code? import numpy as np X = np.array([2,2,0,2,2,0,1,1,0,1,1,0]) Y = X[X != 0] print(Y[::2]) Answer: [2 2 1 1] # 2 points Remark: ˆ 0.5 point will be deducted if the answers are in raw number format without [](i.e. 1 2 5 6). Because in this case, the answer is no longer in array type, and we can- not distinguish whether the answer is a 2d array or 1d array. This deduction only performs once for for a(i) and a(ii). (c) [9 points] Given the following code which computes the distance between each training point in X train and each test point in X test using a nested loop over both the training data and test data. import numpy as np def compute_distances_nested_loops(X_train, X_test): num_test = X_test.shape[0] num_train = X_train.shape[0] distances = np.zeros((num_test, num_train)) # --- BLOCK TO REWRITE --- for i in range(num_test): for j in range(num_train): distances[i,j] = np.sqrt(np.sum(np.square(X_test[i,:]-X_train[j,:]))) # --- BLOCK TO REWRITE --- return distances Train = np.array([[0,1], [1,2], [2,3], [3,4]]) Test = np.array([[5,6], [7,8]]) print(compute_distances_nested_loops(Train, Test)) # Output: # [[7.07106781 5.65685425 4.24264069 2.82842712] # [9.89949494 8.48528137 7.07106781 5.65685425]] Rewrite the block of code between the comment lines # --- BLOCK TO REWRITE --- using no explicit loops in the space provided. Hints: ˆ To compute the distance between training data point (0, 1) and test data point (5, 6), we do dist = (0 −5)2 + (1 −6)2 ˆ Expand it dist = 02 −2(0)(5) + 52 + 12 −2(1)(6) + 62 = 02 + 12 + 52 + 62 −2(0)(5) −2(1)(6) = 02 + 12 + 52 + 62 −2((0)(5) + (1)(6)) You may find the following functions useful for this question. ˆ Dot product of two arrays: numpy.dot(a, b) – It returns the product of matrix multiplication. ˆ Return the element-wise square of the input numpy.square(x) – x is the input data ˆ Return the sum of array elements over a given axis. numpy.sum(a, axis=None) – a is an array with elements to sum. – axis: None or int or tuple of ints. axis = 0 means along the column and axis = 1 means working along ther row. ˆ Return the non-negative square-root of an array, element-wise. numpy.sqrt(x) – x is the array with values whose square-roots are required. ˆ Return a matrix (or a 2D array) from an 1D array. numpy.matrix(data) – data is the 1D array. – Example: numpy.matrix([1, 2, 3]), output is [[1, 2, 3]] ˆ Insert a new axis that will appear at the axis position in the expanded array shape. numpy.expand_dims(a,axis) – a is the input array. – axis is an int or tuple of ints that represents poisition in the expanded axes where the new axis (or axes) is placed. Answer: M = np.dot(X_test, X_train.T) te = np.square(X_test).sum(axis=1) tr = np.square(X_train).sum(axis=1) dists = np.sqrt(-2*M + np.matrix(tr) + np.matrix(te).T) Alternative answer: distances = np.sqrt(np.sum(np.square(np.expand_dims(X_test, 1) - X_train), axis=2)) distances = np.sqrt(np.sum(np.square(X_test[:,None] - X_train[None]), axis=-1)) Marking scheme: ˆ Case 0: Any for loop, list comprehension. # 0 point ˆ Case 1: (a) M = np.dot(X_test, X_train.T) # 2 points 1 point if missing transpose (b) te = np.square(X_test).sum(axis=1) # 2 points tr = np.square(X_train).sum(axis=1) # 2 points – each square # 1 point – each sum with right axis # 1 point (c) dists = np.sqrt(-2*M + np.matrix(tr) + np.matrix(te).T) # 3 points – 1 point if shape doesn’t match. – 2 points if np.matrix is missing, or attempt to modify the shape but wrong – if np.sqrt is missing, * if previous part isn’t calculated correctly, 0 point for this part. * -1 point otherwise. ˆ Case 2: distances = np.sqrt(np.sum(np.square(np.expand_dims(X_test, 1) - X_train), axis=2)) # 4 points (a) 4 points for the subtraction – 2 points if the shape modification is wrong. – 0 points if no shape modification at all. (b) 2 points for square (c) 2 points for sum; 1 point if axis is missing or wrong (d) 1 point for sqrt (unlike Case 1, sqrt is worth 1 point here.) (e) -3 points if a2–b2 is used. Remarks: (a) If reshaped is used to change the shape of the solution, -1 point. (b) If the program assume there’s 4 train points or 2 test points, and the number of dimension is 2, -1 point or 0 point for the whole question. (c) If Train, Test used instead of Xtrain, Xtest, -1 point. If your program are terribly wrong (< 3 points) anyway, this mark wouldn’t be deducted. (d) If extra code which leads to wrong answer, -1 point. (e) If the output shape is (numtrain, numtest) instead of (numtest, numtrain), -1 point. (f) If expand dims/reshape used without reassigning the variable, i.e. a = a.reshape(*b.shape), -1 point. (g) If missing shape, (e.g. Xtrain[0] instead of Xtrain.shape[0]), – 0.5 point. (h) Wrong spelling and syntax will not resulted in mark deduction, but 0.5 point will be deducted if you got full points. (i) Index using i j assuming the existence of for loop gets 0 point.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''), ('COMP2211-2022-spring-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [11 points] Conditional Probability and Bayes Classifier (a) [2 points] Given the following probabilities: ˆ P(Good course | Desmond is in the course) = 0.5 ˆ P(Good course | Desmond is not in the course) = 0.3 ˆ P(Desmond in a randomly chosen course) = 0.1 What is P(Desmond is in the course | Not a good course)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and de- nominator), e.g. 1/2. (b) [7 points] Suppose you are given the following set of data with three Boolean input variables A, B, C, and a single Boolean output variable D. A B C D (i) According to the Naive Bayes classifier, what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. (ii) According to the Naive Bayes classifier, what is P(D = 0|A = 1, B = 1)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. (iii) According to the general Bayes classifier (i.e. without independence assumption), what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. (iv) According to the general Bayes classifier (i.e. without independence assumption), what is P(D = 1|A = 1, B = 1)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. (c) [2 points] The Naive Bayes algorithm selects the class c for an example x that maxi- mizes P(c|x). Suppose one of your classmates stated that it is equivalent to selecting the c that maximizes P(x|c) under an assumption. What is the assumption that he has made?', 11, 7, NULL::jsonb, NULL, NULL, 'Problem 3 [11 points] Conditional Probability and Bayes Classifier (a) [2 points] Given the following probabilities: ˆ P(Good course | Desmond is in the course) = 0.5 ˆ P(Good course | Desmond is not in the course) = 0.3 ˆ P(Desmond in a randomly chosen course) = 0.1 What is P(Desmond is in the course | Not a good course)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and de- nominator), e.g. 1/2. Answer: Let D be Desmond is in the course, G be a good course P(D|NOT G) = P(D, NOT G) P(Not G) = P(Not G|D)P(D) P(Not G|D)P(D) + P(Not G|Not D)P(Not D) = (1 −0.5) × 0.1 (1 −0.5) × 0.1 + (1 −0.3) × (1 −0.1) = 5 Marking scheme: ˆ 2 points for giving the correct answer. (b) [7 points] Suppose you are given the following set of data with three Boolean input variables A, B, C, and a single Boolean output variable D. A B C D (i) According to the Naive Bayes classifier, what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. Answer: P(D = 1|A = 1, B = 1, C = 0) = P(D = 1)P(A = 1|D = 1)P(B = 1|D = 1)P(C = 0|D = 1) P1 j=0 P(D = j)P(A = 1|D = j)P(B = 1|D = j)P(C = 0|D = j) = (4/8)(2/4)(1/4)(2/4) (4/8)(2/4)(2/4)(1/4) + (4/8)(2/4)(1/4)(2/4) = 1 Marking scheme: ˆ 1.75 points for giving the correct answer. (ii) According to the Naive Bayes classifier, what is P(D = 0|A = 1, B = 1)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. Answer: P(D = 0|A = 1, B = 1) = P(D = 0)P(A = 1|D = 0)P(B = 1|D = 0) P1 j=0 P(D = j)P(A = 1|D = j)P(B = 1|D = j) = (4/8)(2/4)(2/4) (4/8)(2/4)(2/4) + (4/8)(2/4)(1/4) = 2 Marking scheme: ˆ 1.75 points for giving the correct answer. (iii) According to the general Bayes classifier (i.e. without independence assumption), what is P(D = 1|A = 1, B = 1, C = 0)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. Answer: P(D = 1|A = 1, B = 1, C = 0) = 0 as there is no data with A = 1, B = 1, C = 0 and D = 1 in the dataset. Marking scheme: ˆ 1.75 points for giving the correct answer. (iv) According to the general Bayes classifier (i.e. without independence assumption), what is P(D = 1|A = 1, B = 1)? If your answer is not an integer, write your answer in fraction form (use / to separate your numerator and denominator), e.g. 1/2. Answer: P(D = 1|A = 1, B = 1) = 1/2 as number of data with A = 1, B = 1, D = 1 is 1, and number of data with A = 1, B = 1 is 2. Marking scheme: ˆ 1.75 points for giving the correct answer. (c) [2 points] The Naive Bayes algorithm selects the class c for an example x that maxi- mizes P(c|x). Suppose one of your classmates stated that it is equivalent to selecting the c that maximizes P(x|c) under an assumption. What is the assumption that he has made? Answer: P(c|x) = P(x|c)P(c) P(x) , so finding the c that maximizes P(c|x) is equivalent to finding c that maximizes P(x|c), if the prior P(c) is uniform. Marking scheme: ˆ 2 points for stating the assumption correctly.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors (a) [3 points] Consider a set of 5 training data given as ((xTrain , xTrain ), cTrain) values, where xTrain and xTrain are the two attribute values (positive integers) and cTrain is the binary class label, A or B: { ((6,7),A), ((4,8),B), ((6,5),B), ((7,9),A), ((2,4),A) } Classify a test example (xTest , xTest ) with attribute values (4,7) using a KNN classifier with K = 3 and Manhattan distance defined by distance(xTrain, xTest) = P2 i=1 |xTrain i −xTest i | where | · | denote absolute value. Complete the following table by filling in the computed Manhattan distance between each training data point and the test example. Determine the class label based on the results. xTrain xTrain cTrain Distance A B B A A (b) [6 points] Judge whether each of the following student’s claims is correct or not. Explain why. (i) [3 points] A student claims that the results of a general KNN classifier that uses Euclidean distance will change if we multiply all attribute values of each training and test data point by 0.5. (ii) [3 points] Another student claims that the classification accuracy of the training set will always increase if the value of K used in KNN classifier is incrementally increased from 1 to N, where N is the total number of training examples. (c) [5 points] Consider KNN using Euclidean distance on the following data set. Each point belongs to one of the two classes: + and x. (i) [2 points] Perform 10-fold cross validation on the given data set (i.e. the 10 data points as shown in the figure), what is the validation error when using 1-nearest neighbor? (ii) [3 points] Which of the following values of K leads to the minimum 10-fold cross vali- dation error: 3, 5 or 9? What is the error for that K? If there is a tie, please elaborate.', 14, 9, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors (a) [3 points] Consider a set of 5 training data given as ((xTrain , xTrain ), cTrain) values, where xTrain and xTrain are the two attribute values (positive integers) and cTrain is the binary class label, A or B: { ((6,7),A), ((4,8),B), ((6,5),B), ((7,9),A), ((2,4),A) } Classify a test example (xTest , xTest ) with attribute values (4,7) using a KNN classifier with K = 3 and Manhattan distance defined by distance(xTrain, xTest) = P2 i=1 |xTrain i −xTest i | where | · | denote absolute value. Complete the following table by filling in the computed Manhattan distance between each training data point and the test example. Determine the class label based on the results. xTrain xTrain cTrain Distance A B B A A Answer: xTrain xTrain cTrain Distance A B B A A The predicted class label of the test example is B. Marking scheme: ˆ 0.5 point for giving each correct distance value. 2.5 points in total. ˆ 0.5 point for giving the correct class label. (b) [6 points] Judge whether each of the following student’s claims is correct or not. Explain why. (i) [3 points] A student claims that the results of a general KNN classifier that uses Euclidean distance will change if we multiply all attribute values of each training and test data point by 0.5. Answer: The claim is false, because K nearest neighbors will remain unchanged after multi- plying all attribute values of each training and test data points by 0.5. Marking scheme: ˆ 1 point for stating the claim is false. ˆ 2 points for giving the correct explanation. Remark: ˆ 1 point given if stating the claim is correct but did mention that the change on distance will be a multiplication of constant to the old distance AND didn’t state result of KNN classifier will change. ˆ 2 points given if stating the claim is correct but mention the change on distance will be a multiplication and state the result wont change. ˆ 0 point if correct deduction but draw the conclusion that result will change. ˆ 1 point for explanation if treat the question as asking multiply the values on Manhattan and answered correctly. ˆ 0 point for explanation if treat the question as asking comparison between Man- hattan and euclidean distance. (ii) [3 points] Another student claims that the classification accuracy of the training set will always increase if the value of K used in KNN classifier is incrementally increased from 1 to N, where N is the total number of training examples. Answer: The claim is false. A counterexample is as follows: The training set accuracy when K = 1 will be 100%. As K approaches the total number of training examples more and more examples influence the class, and eventually the class will always be the majority class in the training set. Marking scheme: ˆ 1 point for stating the claim is false. ˆ 2 points for giving the correct explanation. Remarks: ˆ Give full mark if they misregard outliers as further neighbors. ˆ -1 point if they mention is due to outliers but didn’t explain what is regarded as outliers. ˆ -2 points if they really mean outliers. ˆ -1 point if they only state larger k will cover more neighbors (this is just explain- ing what does a larger k means but not explaining why larger k can lower the accuracy) (*further neighbors is accepted but more neighbors is not) ˆ Other accept answers: i. Larger k include further neighbors which have low relevancy. ii. Underfitting iii. Giving a concrete situation that the claim is wrong. (c) [5 points] Consider KNN using Euclidean distance on the following data set. Each point belongs to one of the two classes: + and x. (i) [2 points] Perform 10-fold cross validation on the given data set (i.e. the 10 data points as shown in the figure), what is the validation error when using 1-nearest neighbor? Answer: Every point is misclassified. So, the validation error is 10/10. Marking scheme: ˆ 2 points for giving the correct validation error. Note: Both 10/10 and 100% are accepted as the correct answers. (ii) [3 points] Which of the following values of K leads to the minimum 10-fold cross vali- dation error: 3, 5 or 9? What is the error for that K? If there is a tie, please elaborate. Answer: All 3 values of K mis-classify all of the points and have the same classification errors, 10/10. Marking scheme: ˆ 2 points for stating all 3 values of K have the same classification errors. ˆ 1 point for giving the correct error. Note: Both 10/10 and 100% are accepted as the correct answers.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [12 points] K-Means Clustering Given a 1-dimensional data set {0, 3, 6, 9, 27, 30}, use the K-means clustering algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3 and c2 = 4 are chosen as the initial cluster centers. (a) [4.5 points] Perform one iteration of K-means clustering by finding the Euclidean distance between each data point in the data set and the centroids, and assign each data point to the closest centroid according to the distance found. Fill in the following table with your computed values. If your distance value is not an integer, write your answer in decimal form, e.g. 1.234. Data Point Distance between the data point and c1 = 3 Distance between the data point and c2 = 4 Closest Centroid (c1 or c2?) (b) [1.5 points] What are the values of c1 and c2 after one iteration of K-means? If your answer is not an integer, write your answer in decimal form, e.g. 1.234. (c) [4.5 points] Perform the second iteration of K-means clustering by finding the Euclidean distance between each data point in the data set and the computed centroids in part (b), and assign each data point to the closest centroid according to the distance found. Fill in the following table with your computed values. If your distance value is not an integer, write your answer in decimal form, e.g. 1.234. Data Point Distance between the data point and the c1 computed in part (b) Distance between the data point and the c2 computed in part (b) Closest Centroid (c1 or c2?) (d) [1.5 points] What are the values of c1 and c2 after the second iteration of K-means? If your answer is not an integer, write your answer in decimal form, e.g. 1.234.', 12, 11, NULL::jsonb, NULL, NULL, 'Problem 5 [12 points] K-Means Clustering Given a 1-dimensional data set {0, 3, 6, 9, 27, 30}, use the K-means clustering algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3 and c2 = 4 are chosen as the initial cluster centers. (a) [4.5 points] Perform one iteration of K-means clustering by finding the Euclidean distance between each data point in the data set and the centroids, and assign each data point to the closest centroid according to the distance found. Fill in the following table with your computed values. If your answer is not an integer, write your answer in decimal form, e.g. 1.234. Data Point Distance between the data point and c1 = 3 Distance between the data point and c2 = 4 Closest Centroid (c1 or c2?) Answer: Data Point Distance between the data point and c1 = 3 Distance between the data point and c2 = 4 Closest Centroid (c1 or c2?) c1 c1 c2 c2 c2 c2 Marking scheme: ˆ 0.25 for giving each correct value. 4.5 points in total. (b) [1.5 points] What are the values of c1 and c2 after one iteration of K-means? If your answer is not an integer, write your answer in decimal form, e.g. 1.234. Answer: c1 = 0 + 3 = 1.5 c2 = 6 + 9 + 27 + 30 = 18 Marking scheme: ˆ 0.75 for giving each correct centroid. 1.5 points in total. (c) [4.5 points] Perform the second iteration of K-means clustering by finding the Euclidean distance between each data point in the data set and the computed centroids in part (b), and assign each data point to the closest centroid according to the distance found. Fill in the following table with your computed values. If your answer is not an integer, write your answer in decimal form, e.g. 1.234. Data Point Distance between the data point and the c1 computed in part (b) Distance between the data point and the c2 computed in part (b) Closest Centroid (c1 or c2?) Answer: Data Point Distance between the data point and the c1 computed in part (b) Distance between the data point and the c2 computed in part (b) Closest Centroid (c1 or c2?) 1.5 c1 1.5 c1 4.5 c1 7.5 c1 25.5 c2 28.5 c2 Marking scheme: ˆ 0.25 for giving each correct value. 4.5 points in total. (d) [1.5 points] What are the values of c1 and c2 after the second iteration of K-means? If your answer is not an integer, write your answer in decimal form, e.g. 1.234. Answer: c1 = 0 + 3 + 6 + 9 = 4.5 c2 = 27 + 30 = 28.5 Marking scheme: ˆ 0.75 for giving each correct centroid. 1.5 points in total.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [20 points] Perceptron Given the following training dataset: x1 0.5 x2 0.5 T -1 -1 -1 (a) [8 points] Show the action of the perceptron algorithm for the above sequence of data points by completing the following table. Assume η = 1 and we start with the following initial weights and bias w1 = 1 w2 = 1 θ = 0 and use the following activation function. f(z) =    z ≥0 −1 otherwise Updating rules: ∆wi = η(T −O)xi ∆θ = η(T −O) wi = wi + ∆wi θ = θ + ∆θ where i ∈{1, 2}. If your answer is not an integer, write your answer in decimal form, e.g. 1.234. x1 x2 T O ∆w1 w1 ∆w2 w2 ∆θ θ - - - - - - - (b) [2 points] According to the values in the table above, state whether the perceptron algorithm is converged in 1 epoch. If not, explain why. (c) [10 points] Write a Python program to verify your answers of Part (a). In your program, you need to define the following variables. ˆ A 2D NumPy array, X, to store all the attribute values x1, x2, where the shape is (8,2) ˆ A 1D NumPy array, T,to store the target values, where the shape is (8,) ˆ A 1D NumPy array, W, to store the weights, where the shape is (2,) ˆ A float bias value, b. and perform the required computations. Also, print the following sequence of values (in exact order) for each iteration: x1 < s > x2 < s > T < s > O < s > ∆w1 < s > w1 < s > ∆w2 < s > w2 < s > ∆θ < s > θ < end > where < s > refers to an empty space, and < end > refers to an end of line character. The following shows a line of sample output. 0 0 0 0 0 0 0 0 0 0 Remark: You cannot use any other libraries other than NumPy in your program.', 20, 12, NULL::jsonb, NULL, NULL, 'Problem 6 [20 points] Perceptron Given the following training dataset: x1 0.5 x2 0.5 T -1 -1 -1 (a) [8 points] Show the action of the perceptron algorithm for the above sequence of data points by completing the following table. Assume η = 1 and we start with the following initial weights and bias w1 = 1 w2 = 1 θ = 0 and use the following activation function. f(z) =    z ≥0 −1 otherwise Updating rules: ∆wi = η(T −O)xi ∆θ = η(T −O) wi = wi + ∆wi θ = θ + ∆θ where i ∈{1, 2}. If your answer is not an integer, write your answer in decimal form, e.g. 1.234. x1 x2 T O ∆w1 w1 ∆w2 w2 ∆θ θ - - - - - - - Answer: x1 x2 T O ∆w1 w1 ∆w2 w2 ∆θ θ - - - - - - - -1 -2 -2 -2 -1 -6 -5 -6 -5 -2 -4 -1 -2 0.5 0.5 -1 -1 -1 -2 -4 -4 -4 Marking scheme: ˆ 0.1 point for giving each correct value. 8 points in total. (b) [2 points] According to the values in the table above, state whether the perceptron al- gorithm is converged in 1 epoch. If not, explain why. Answer: The algorithm is not converged in 1 epoch. Since there are changes of weights and biases. Marking scheme: ˆ 1 point for stating the algorithm is not converged. ˆ 1 point for giving a correct explanation. (c) [10 points] Write a Python program to verify your answers of Part (a). In your program, you need to define the following variables ˆ A 2D NumPy array, X, to store all the attribute values x1, x2, where the shape is (8,2) ˆ A 1D NumPy array, T,to store the target values, where the shape is (8,) ˆ A 1D NumPy array, W, to store the weights, where the shape is (2,) ˆ A float bias value, b. and perform the required computations. Also, print the following sequence of values (in exact order) for each iteration: x1 < s > x2 < s > T < s > O < s > ∆w1 < s > w1 < s > ∆w2 < s > w2 < s > ∆θ < s > θ < end > where < s > refers to an empty space, and < end > refers to an end of line character. The following shows a line of sample output. 0 0 0 0 0 0 0 0 0 0 Remark: You cannot use any other libraries other than NumPy in your program. Answer: import numpy as np # 0.5 point X = np.array([[10,10], [0,0], [8,4], [3,3], [4,8], [0.5,0.5], [4,3], [2,5]]) # 1 point T = np.array([1,-1,1,-1,1,-1,1,1]) # 0.5 point W = np.array([1,1], dtype=float) # 1 point b = 0.0 # 0.5 point for i in range(X.shape[0]): # 1 point y = X[i].dot(W) + b # 1 point if y >= 0: # 0.5 point O = 1 # 0.5 point else: O = -1 # 0.5 point DW = (T[i] - O) * X[i] # 0.5 point W += DW # 0.5 point Db = (T[i] - O) # 0.5 point b += Db # 0.5 point print(X[i][0], X[i][1], T[i], O, DW[0], W[0], DW[1], W[1], Db, b) # 1 point Remarks: ˆ -0.25 point for forgetting to specify W = np.array([1,1], dtype=float) ˆ Can also b = 0, Python will duck-type into float when needed. ˆ -0.25 point each for not using NumPy array. No overlap with W float penalty, since Python list supports duck-typing. ˆ -0.25 point each for wrong array shape. ˆ -0.5 point for hard-coding for i in range(8):. ˆ -0.25 point for forgetting bias in the output calculation. ˆ -0.25 point for minor print formatting errors. ˆ -0.25 point for incorrect print when output == target. ˆ -0.25 point each for miscellaneous syntax errors. ˆ -1 flat point for defining as class/function(s) but not calling. ˆ -10 floor point for using SKlearn.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'hard', '', '', ''), ('COMP2211-2022-spring-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [12 points] Perceptron and Multilayer Perceptron (a) [4 points] Can we represent the given boolean function with a single neuron as shown below? A B f(A,B) If yes, show the possible weights, bias and activation function that can be used to com- pute the output of all the data points correctly. If not, explain why not in 1-2 sentences. (b) [6 points] Suppose we have a neural network as shown below with θ1 = 0, θ2 = 0, θ3 = 0, and linear activation function (i.e. f(x) = Cx, where C is a constant). Can any function that is represented by the above network be represented by a single unit of artificial neural network in the following diagram? If so, detail the weights (w7 and w8), bias (θ), and the activation function f(x). Otherwise, explain why not. (c) [2 points] Given a multilayer perceptron with 3 layers (input layer, hidden layer and output layer). The number of units in each of the these layers are 3, 4, 2. Assume each input or neuron is fully-connected. Calculate the number of trainable parameters of this network. -------------------- END OF PAPER --------------------', 12, 14, NULL::jsonb, NULL, NULL, 'Problem 7 [12 points] Perceptron and Multilayer Perceptron (a) [4 points] Can we represent the given boolean function with a single neuron as shown below? A B f(A,B) If yes, show the possible weights, bias and activation function that can be used to com- pute the output of all the data points correctly. If not, explain why not in 1-2 sentences. Answer: Yes, we can represent this function with a single neuron, since it is linearly separable. One set of possible weights and bias is: w1 = 1, w2 = −1, θ = −0.5, and the activation function is f(x) =    x > 0 otherwise Marking scheme: ˆ 1 point for stating it is possible to represent the given boolean function with a single neuron. ˆ 1 point for showing the “standard” activation function. ˆ 2 points for correct values of w1, w2, and θ, given the activation function is the “standard” one. ˆ If the given activation function is not “standard”, but the parameters are suitable, also get 2 points. ˆ 0 point for stating not possible to represent. (b) [6 points] Suppose we have a neural network as shown below with θ1 = 0, θ2 = 0, θ3 = 0, and linear activation function (i.e. f(x) = Cx, where C is a constant). Can any function that is represented by the above network be represented by a single unit of artificial neural network in the following diagram? If so, detail the weights (w7 and w8), bias (θ), and the activation function f(x). Otherwise, explain why not. Answer: Yes, the network can be represented by a single unit of artificial neural network by setting its weights, w7 = w1w5 + w3w6, w8 = w2w5 + w4w6, and the activation function f(x) = C2. Marking scheme: ˆ 0.5 point for stating any function that is represented by the MLP can be represented by a single unit of artificial neural network. ˆ 1.5 points for showing each possible weight w7, w8. 3 points in total. ˆ 1 point for showing a possible bias value, i.e. 0. ˆ 1.5 for showing the activation function. Remarks: ˆ If stating NO, -6 points. But among those NO’s, if students put the expression OUTPUT=C2w5(x1w1 + x2w2 + θ1) + C2w6(x1w3 + x2w4 + θ2) + Cθ3, give 1 point. ˆ For those stating YES, if missing some value, deduct the score of that value. ˆ If the order of C is wrong, e.g. one C in w7w8 and no C in activation, or C2 in w7w8 and one C in activation, -1 point. ˆ For other wrong values, -full mark for that values. ˆ If didn’t plug in given values and the expression is wrong, even if the expression may evaluate to the same value as the answer, -full mark for that expression or -1 point if it’s an order-of-C issue. ˆ If didn’t plug in given values, e.g. θs=0, the same C for all units (some use C1C2C3), but otherwise correct, -0.5 for each of θ & C. (c) [2 points] Given a multilayer perceptron with 3 layers (input layer, hidden layer and output layer). The number of units in each of the these layers are 3, 4, 2. Assume each input or neuron is fully-connected. Calculate the number of trainable parameters of this network. Answer: 3 ∗4 + 4 + 4 ∗2 + 2 = 26. Marking scheme: ˆ 2 points for giving the correct answer. -------------------- END OF PAPER --------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-final-part-a', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1 point for each correct answer. (a) One drawback of the K-means algorithm is that one needs to specify exactly how many clusters the algorithm should find. (b) Increasing the number of hidden layers always increases the model performance. (c) When handling a binary classification task, both softmax and sigmoid functions can be used as the activation function in the output layer. (d) Validation accuracy must be lower than training accuracy. (e) We cannot train/inference deep learning networks using CPU. (f) Otsu’s thresholding method and affine transformations are point-based image operations. (g) In the convolutional layer of a CNN, the number of weights depends on the depth of the input volume and the number of biases is equal to the number of kernels. (h) After training a neural network, you observe a large gap between the training accuracy (100%) and the task accuracy (40%). Dropout is commonly used to reduce this gap. (i) In a minimax-based 3×3 tic-tac-toe game, an AI player will definitely win because it knows all possible moves of the game. (j) The alpha-beta pruning algorithm is preferred to minimax because it computes the same answer as minimax while usually doing so without examining as much of the game tree. Question Answer (T/F) (a) (b) (c) (d) (e) (f) (g) (h) (i) (j)', 10, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1 point for each correct answer. (a) One drawback of the K-means algorithm is that one needs to specify exactly how many clusters the algorithm should find. (b) Increasing the number of hidden layers always increases the model performance. (c) When handling a binary classification task, both softmax and sigmoid functions can be used as the activation function in the output layer. (d) Validation accuracy must be lower than training accuracy. (e) We cannot train/inference deep learning networks using CPU. (f) Otsu’s thresholding method and affine transformations are point-based image operations. (g) In the convolutional layer of a CNN, the number of weights depends on the depth of the input volume and the number of biases is equal to the number of kernels. (h) After training a neural network, you observe a large gap between the training accuracy (100%) and the task accuracy (40%). Dropout is commonly used to reduce this gap. (i) In a minimax-based 3×3 tic-tac-toe game, an AI player will definitely win because it knows all possible moves of the game. (j) The alpha-beta pruning algorithm is preferred to minimax because it computes the same answer as minimax while usually doing so without examining as much of the game tree. Answer: Question Answer (T/F) (a) T (b) F (c) T (d) F (e) F (f) F (g) T (h) T (i) F (j) T Marking scheme: ˆ 1 point for giving each correct answer. 10 points in total.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-final-part-a', '2', NULL, 2, 'long_question', 'long_answer', 'Problem 2 [14 points] Na¨ıve Bayes and K-Nearest Neighbors Given the training data in the table below. No. CGPA Interest in Computing Subjects Practice-oriented Learner? COMP 2211 Grade Select COMP as Major ≤3 High No B No ≤3 High No A No > 3 AND ≤4 High No B Yes > 4 Medium No B Yes > 4 Low Yes B Yes > 4 Low Yes A No > 3 AND ≤4 Low Yes A Yes ≤3 Medium No B No ≤3 Low Yes B Yes > 4 Medium Yes B Yes ≤3 Medium Yes A Yes > 3 AND ≤4 Medium No A Yes > 3 AND ≤4 High Yes B Yes > 4 Medium No A No (a) [6.5 points] Given a new example with the following attribute values. Predict the value of its “Select COMP as Major” using Na¨ıve Bayes classifier. Show all the steps. ˆ CGPA ≤3 ˆ Interest in Computing Subjects = Medium ˆ Practice-oriented Learner = Yes ˆ COMP 2211 Grade = B (b) [7.5 points] Similar to the above, but this time, predict the value of its “Select COMP as Major” using K-nearest neighbor for K = 5. Complete the following table and state the prediction result based on the data in the completed table. For similarity measure, use a simple match of attribute values: S(ai, bi) = X i=1 wi ∗distance(ai, bi) where distance(ai, bi) is 0 if ai equals bi, and 1 otherwise. ai and bi are either CGPA, interest in computing subjects, practice-oriented learner or COMP 2211 grade. Weights, wi, are all 1 except for interest in computing subjects, it is 2. No. Class Distance to New Example No No Yes Yes Yes No Yes No Yes Yes Yes Yes Yes No', 14, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [14 points] Na¨ıve Bayes and K-Nearest Neighbors Given the training data in the table below. No. CGPA Interest in Computing Subjects Practice-oriented Learner? COMP 2211 Grade Select COMP as Major ≤3 High No B No ≤3 High No A No > 3 AND ≤4 High No B Yes > 4 Medium No B Yes > 4 Low Yes B Yes > 4 Low Yes A No > 3 AND ≤4 Low Yes A Yes ≤3 Medium No B No ≤3 Low Yes B Yes > 4 Medium Yes B Yes ≤3 Medium Yes A Yes > 3 AND ≤4 Medium No A Yes > 3 AND ≤4 High Yes B Yes > 4 Medium No A No (a) [6.5 points] Given a new example with the following attribute values. Predict the value of its “Select COMP as Major” using Na¨ıve Bayes classifier. Show all the steps. ˆ CGPA ≤3 ˆ Interest in Computing Subjects = Medium ˆ Practice-oriented Learner = Yes ˆ COMP 2211 Grade = B Answer: Let ˆ E be CGPA ≤3, Interest in Computing Subjects = Medium, Practice-oriented Learner = Yes, COMP 2211 Grade = B ˆ E1 be CGPA ≤3 ˆ E2 be interest in computing subjects = medium ˆ E3 be practice-oriented learner = yes ˆ E4 be COMP 2211 grade = B P(Y es|E) = P(E1|Y es)P(E2|Y es)P(E3|Y es)P(E4|Y es)P(Y es) P(E) P(Y es) = 9/14 = 0.643 P(E1|Y es) = 2/9 = 0.222 P(E2|Y es) = 4/9 = 0.444 P(E3|Y es) = 6/9 = 0.667 P(E4|Y es) = 6/9 = 0.667 P(Y es|E) = (0.222)(0.444)(0.667)(0.668)(0.443) P(E) = 0.028 P(E) P(No|E) = P(E1|No)P(E2|No)P(E3|No)P(E4|No)P(No) P(E) P(No) = 5/14 = 0.356 P(E1|No) = 3/5 = 0.6 P(E2|No) = 2/5 = 0.4 P(E3|No) = 1/5 = 0.2 P(E4|No) = 2/5 = 0.4 P(No|E) = (0.6)(0.4)(0.2)(0.4)(0.357) P(E) = 0.007 P(E) Hence, the Na¨ıve Bayes classifier predicts “Select COMP as Major” = Yes for the new example. Marking scheme: ˆ 0.5 for giving each conditional and prior probability. 6 points in total. ˆ 0.5 for giving the correct prediction. (b) [7.5 points] Similar to the above, but this time, predict the value of its “Select COMP as Major” using K-nearest neighbor for K = 5. Complete the following table and state the prediction result based on the data in the completed table. For similarity measure, use a simple match of attribute values: S(ai, bi) = X i=1 wi ∗distance(ai, bi) where distance(ai, bi) is 0 if ai equals bi, and 1 otherwise. ai and bi are either CGPA, interest in computing subjects, practice-oriented learner or COMP 2211 grade. Weights, wi, are all 1 except for interest in computing subjects, it is 2. No. Class Distance to New Example No 0 + 2 + 1 + 0 = 3 No 0 + 2 + 1 + 1 = 4 Yes 1 + 2 + 1 + 0 = 4 Yes 1 + 0 + 1 + 0 = 2 Yes 1 + 2 + 0 + 0 = 3 No 1 + 2 + 0 + 1 = 4 Yes 1 + 2 + 0 + 1 = 4 No 0 + 0 + 1 + 0 = 1 Yes 0 + 2 + 0 + 0 = 2 Yes 1 + 0 + 0 + 0 = 1 Yes 0 + 0 + 0 + 1 = 1 Yes 1 + 0 + 1 + 1 = 3 Yes 1 + 2 + 0 + 0 = 3 No 1 + 0 + 1 + 1 = 3 Among the 5 nearest neighbors, 4 are from class Yes, and 1 from class No. Hence, the KNN classifier predicts “Select COMP as Major” = Yes for the new example. Marking scheme: ˆ 0.5 point for giving each correct answer. 7 points in total. ˆ 0.5 point for giving the correct prediction.', ARRAY['Probabilistic Models', 'KNN and Clustering']::TEXT[], 'Probabilistic Models', NULL, ARRAY['Probabilistic Models', 'KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-final-part-a', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [14 points] Multilayer Perceptron (MLP) This problem is about multilayer perceptron (MLP). Answer all the questions below. (a) [3 points] State when using the F1 metric is better than using accuracy as an evaluation metric. Also use a confusion matrix to illustrate the stated situation. (b) [2 points] Suppose we are dealing with binary classification tasks using MLP. Explain why it is inappropriate to use ReLU as the activation function in the output layer. (c) [2 points] Design the output layer of an MLP for handling a multilabel classification problem with n classes by stating the number of neurons and the activation function. Remark: Multilabel classification is a supervised learning problem where an instance may be associated with multiple labels. (d) [3 points] Explain why it is not good to initialize all weights of an MLP to zero. Hint: Refer to the following updating rules of weights and biases for MLP. ˆ δk = (Ok −Tk)Ok(1 −Ok) ˆ δj = Oj(1 −Oj) P k∈K δkwjk ˆ wjk ←wjk −ηδkOj ˆ wij ←wij −ηδjOi ˆ θj ←θj −ηδj ˆ θk ←θk −ηδk (e) [2 points] Explain what will happen if the learning rate η of an MLP is (i) Too large (ii) Too small (f) [2 points] Describe a way to avoid overfitting in MLP. Explain why it works.', 14, 6, NULL::jsonb, NULL, NULL, 'Problem 3 [14 points] Multilayer Perceptron (MLP) This problem is about multilayer perceptron (MLP). Answer all the questions below. (a) [3 points] State when using the F1 metric is better than using accuracy as an evaluation metric. Also use a confusion matrix to illustrate the stated situation. Answer: ˆ When the dataset is unbalanced. ˆ When false-negative and false-positive matter a lot. Actual/Predicted Infectious disease=yes Infectious disease=no Infectious disease=yes Infectious disease=no Accuracy = 0.8016 F1 score = 0.0741 Marking scheme: ˆ 1 point for stating the situation ˆ 2 points for the confusion matrix (b) [2 points] Suppose we are dealing with binary classification tasks using MLP. Explain why it is inappropriate to use ReLU as the activation function in the output layer. Answer: We cannot determine the cut-off threshold to distinguish between the output classes when there is an unbounded output range. Marking scheme: ˆ 2 points for explaining by the “unbounded” range of ReLU so cannot determine the cut-off ˆ 1 point if mentioning the range of function (*not describing what ReLU do, but stating the range) but no further explanation ˆ 0 point if only stating ReLU can output value more than 0 & 1 without mentioning it has an unbounded range (bounded function beyond the range from 0 to 1 can work, just do the mapping) (c) [2 points] Design the output layer of an MLP for handling a multilabel classification problem with n classes by stating the number of neurons and the activation function. Remark: Multilabel classification is a supervised learning problem where an instance may be associated with multiple labels. Answer: n neurons and sigmoid function. Marking scheme: ˆ 1 point for n neurons ˆ 1 point for sigmoid (d) [3 points] Explain why it is not good to initialize all weights of an MLP to zero. Hint: Refer to the following updating rules of weights and biases for MLP. ˆ δk = (Ok −Tk)Ok(1 −Ok) ˆ δj = Oj(1 −Oj) P k∈K δkwjk ˆ wjk ←wjk −ηδkOj ˆ wij ←wij −ηδjOi ˆ θj ←θj −ηδj ˆ θk ←θk −ηδk Answer: If a network is initialized with all zeros, all the neurons will propagate on the same gradient, making different neurons learn the same features. Thus, this leads to poor performance. Marking scheme: ˆ 3 points if “the same update/propagate/feature learned/symmetry problem” ˆ 2 points if “zero updates” as it isn’t always true for all MLP design ˆ 2 points if stating only gradient computation are the same for neurons ˆ 1 point if only mentioning deltaj will become zero / stating gradient “may” not update ˆ 0 point for low efficiency / poor performance (e) [2 points] Explain what will happen if the learning rate η of an MLP is (i) Too large (ii) Too small Answer: (i) ˆ Cause the model to coverage too quickly to a sub-optimal solution. ˆ Unstable training like oscillations. Marking scheme: ˆ 1 point for explaining what will happen if the learning rate is too large. ˆ not accept learn faster/overfit/low accuracy (ii) Learning will be slow. Marking scheme: ˆ 1 point for explaining what will happen if the learning rate is too small. ˆ not accept underfit (f) [2 points] Describe a way to avoid overfitting in MLP. Explain why it works. Answer: Adding regulations helps to keep the weights small, such that it is less likely for the model to have a large variance (i.e. be sensitive to noise and fluctuations in data). Marking scheme: ˆ 1 point for method ˆ 1 point for explanation ˆ only valid explanation (but not stating the definition or recalling some rule of thumbs) get the explanation point', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-final-part-a', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [15 points] Digital Image Processing (a) Assume we apply the following kernel to an 8-bit grayscale image. K =   −1 −2 −2   (i) [2 points] Determine the maximum and minimum possible values that a pixel to which the given kernel is applied can have. Do not perform any normalization. (ii) [2 points] Suggest a grey-level transformation function to ensure that any output of this kernel will be within the legal range of a standard 8-bit grayscale image. (b) Consider the following 2 × 2 image: Apply the following image operations sequentially. (i) [3 points] Show the resulting image of size 4 × 4 after adding reflection padding. (ii) [4 points] Apply a 3×3 averaging kernel to the resulting image of part (b)(i). Assume the output image after image averaging is in the same shape as the input image by doing zero padding. Round the number to integer if needed. (iii) [4 points] Compute the optimal threshold after applying ONE ITERATION of Otsu’s method on the resulting image of part (b)(ii). Assume the initial threshold is set to the mean pixel intensity of the resulting image.', 15, 7, NULL::jsonb, NULL, NULL, 'Problem 4 [15 points] Digital Image Processing (a) Assume we apply the following kernel to an 8-bit grayscale image. K =   −1 −2 −2   (i) [2 points] Determine the maximum and minimum possible values that a pixel to which the given kernel is applied can have. Do not perform any normalization. Answer: Since an 8-bit grayscale image is assumed, the highest value that each pixel can have is 255, and the lowest value is 0. After applying the kernel, the maximum value is achieved when the negative values of the kernel multiply 0s and the pos- itive values of the kernel multiply 255s. Then, the maximum achievable value is vmax = (2 + 2 + 1)(255) = 1275. Following the same reasoning, the minimum value will be vmin = (−2 −2 −1)(255) = −1275. Marking scheme: ˆ 1 point for stating the maximum possible value. ˆ 1 point for stating the minimum possible value. (ii) [2 points] Suggest a grey-level transformation function to ensure that any output of this kernel will be within the legal range of a standard 8-bit grayscale image. Answer: A 8-bit image must have a range from 0 to 255. Since the maximum and minimum possible values for the gray levels are vmax = 1275 and vmin = −1275, respectively, the function will be Ioutput = 255 Iinput −vmin vmax −vmin  = 255 Iinput −(−1275) 1275 −(−1275)  = 255 Iinput + 1275  where Iinput and Ioutput are the input and output images, respectively. Marking scheme: ˆ 2 points for stating a transformation function. ˆ -1 point if the function is merely returning legal range. (b) Consider the following 2 × 2 image: Apply the following image operations sequentially. (i) [3 points] Show the resulting image of size 4 × 4 after adding reflection padding. Answer: Marking scheme: ˆ 0.25 point for each correct value. 3 points in total. (ii) [4 points] Apply a 3 × 3 averaging kernel to the resulting image of part (b)(i). As- sume the output image after image averaging is in the same shape as the input image by doing zero padding. Round the number to integer if needed. Answer: Marking scheme: ˆ 0.25 point for each correct value. 4 points in total. (iii) [4 points] Compute the optimal threshold after applying ONE ITERATION of Otsu’s method on the resulting image of part (b)(ii). Assume the initial threshold is set to the mean pixel intensity of the resulting image. Answer: ˆ Initial threshold = (0 + 2 + 4 + 4 + 4 + 9 + 12 + 10 + 8 + 15 + 18 + 14 + 8 + 14 + 16 + 12)/16 = 9.375 ˆ µ1 = (0 + 2 + 4 + 4 + 4 + 9 + 8 + 8)/8 = 4.875 ˆ µ2 = (12 + 10 + 15 + 18 + 14 + 14 + 16 + 12)/8 = 13.875 ˆ Optimal threshold = (4.875 + 13.875) = 9.375 Marking scheme: ˆ 1 point for the answer of initial threshold. ˆ 1 point for the answer of µ1. ˆ 1 point for the answer of µ2. ˆ 1 point for the optimal threshold after 1 iteration. ˆ -1 point for incorrect input from part b(ii). I.e. incorrect calculation based on result of part b(ii.)', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-final-part-b', '1', NULL, 1, 'long_question', 'long_answer', 'Problem 1 [19 points] Convolutional Neural Network (CNN) (a) [1.5 points] Suppose there is a convolutional layer in a CNN with the following parame- ters. ˆ Kernel size: 3 × 3 ˆ Zero padding: 1-pixel border on each side ˆ Stride: 2-pixel in each direction Compute the output feature map of the convolutional layer with the following input im- age and kernel. Assume the number of channels for both input and output images is 1. Express your answer as a 2D nested list. E.g. [[2,2],[2,2]]. Input image: Kernel: (b) [1.5 points] Give an example of a data augmentation technique that would be useful for classifying images of cats and dogs, but not for classifying handwritten digits. Briefly explain your answer. (c) [4.5 points] Assume we have a CNN with the architecture as described in Table 1. Com- plete the table by filling in the input and output shape of each layer in the format: ‘height x width x channel’ (Note: A space is placed before and after the symbol ’x’). To illustrate that, the input shape of the 1st convolutional layer has been inserted for you. Formula: Convolution output size = [ (size of image dimension - size of kernel dimension + 2 × padding) / stride ] + 1 Layer (Size, Specifications) Input Shape Output Shape 7x7 conv, 64 kernels, stride 2, padding 3, with biases 224 x 224 x 3 3x3 max pooling, stride 2, padding 1 3x3 conv, 128 kernels, stride 2, with biases 3x3 conv, 256 kernels, stride 2, padding 1, with biases 3x3 conv, 512 kernels, stride 2, padding 1, with biases Table 1: Architecture of a CNN (d) [2 points] Assume we want to add a fully-connected layer at the back of the CNN as described in part (b). However, the output of the resulting network still contains a large number of parameters. State what we should do if we want to reduce the number of parameters and summarize the output. Hint: We want to turn N × N × 512 output to 1 × 1 × 512. Remark: Convolution layer is not an acceptable answer. (e) [2.5 points] Calculate the total number of parameters of the model? (i.e. for all the layers described in Table 1). (f) Assume the model should classify images into 1000 distinct classes by (i) [2.5 points] Appending the network shown in Table 2 to the end of the CNN network (i.e. the one described in Table 1) appended with the structure that you added in part (c). Layer (Size, Specifications) Input Shape Output Shape Flatten 1 x 1 x 512 Fully-connected (Flatten before feed-in) Table 2: Classification network Calculate the total number of parameters in the whole network (i.e. the layers described in Table 1, your answer in part (c), and the layers described in Table 2). (ii) [2.5 points] Now, suppose we use the MLP described in Table 3 to classify the images instead of CNN. Layer (Size, Specifications) Input Shape Output Shape Flatten 224 x 224 x 3 Fully-connected (Flatten before feed-in) Table 3: Classification using MLP Calculate the total number of parameters in the MLP (i.e. only those described in Table 3). (g) [2 points] State what you observe in part (e). Also, state the property of CNN, which leads to this difference.', 19, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [19 points] Convolutional Neural Network (CNN) (a) [1.5 points] Suppose there is a convolutional layer in a CNN with the following parame- ters. ˆ Kernel size: 3 × 3 ˆ Zero padding: 1-pixel border on each side ˆ Stride: 2-pixel in each direction Compute the output feature map of the convolutional layer with the following input im- age and kernel. Assume the number of channels for both input and output images is 1. Express your answer as a 2D nested list. E.g. [[2,2],[2,2]]. If your calculated answer is a floating-point number, convert it to an integer using the floor function. Input image: Kernel: Answer: [[73]] for using the unflipped kernel OR [[27]] for using the flipped kernel. Marking scheme: ˆ Accepted answers: [[73 (39, 43, 69)]], [[27 (61, 31, 57)]] (for flipped kernel) -1 point if the format is wrong. e.g., missing bracket(s). If there are more than 1 value, the correct answer will be instead a 2x2 array ex- tracted from the following arrays. There are two possible solutions: [A[1, 1], A[1, 2]], [A[2, 1], A[2, 2]] [A[1, 1], A[1, 3]], [A[3, 1], A[3, 3]] (stride 2 with extended zero padding) A = [[ 2, 16, 38, 28], [ 5, 27, 61, 53], [ 8, 31, 57, 60], [ 3, 13, 21, 27]] A = [[18, 44, 22, 12], [25, 73, 39, 17], [22, 69, 43, 10], [ 7, 27, 19, 3]] (this two array is generated by scipy.signal.convolve2d(in1, in2, mode=''full'', boundary=''fill'', fillvalue=0)) There are two possible arrays because traditionally, the former one is the “real” con- volution. I.e. the kernel or image is flipped along both axes before the convolving op- erations. However, at the latter one (the originally intended answer), it’s calculated WITHOUT the flipping. This operation is called the cross-correlation operation. (b) [1.5 points] Give an example of a data augmentation technique that would be useful for classifying images of cats and dogs, but not for classifying handwritten digits. Briefly explain your answer. Answer: Flipping the image horizontally. Doing this to a dog/cat image would be reasonable, but not so for an image of a handwritten digit. Marking scheme: ˆ 2 points for “rotate and reflections” as augmentation methods. ˆ 0 point for image processing methods. (c) [4.5 points] Assume we have a CNN with the architecture as described in Table 1. Com- plete the table by filling in the input and output shape of each layer in the format: ‘height x width x channel’ (Note: A space is placed before and after the symbol ’x’). To illustrate that, the input shape of the 1st convolutional layer has been inserted for you. Formula: Convolution output size = [ (size of image dimension - size of kernel dimension + 2 × padding) / stride ] + 1 If your calculated answer is a floating-point number, convert it to an integer using the floor function. Layer (Size, Specifications) Input Shape Output Shape 7x7 conv, 64 kernels, stride 2, padding 3, with biases 224 x 224 x 3 112 x 112 x 64 3x3 max pooling, stride 2, padding 1 112 x 112 x 64 56 x 56 x 64 3x3 conv, 128 kernels, stride 2, with biases 56 x 56 x 64 27 x 27 x 128 3x3 conv, 256 kernels, stride 2, padding 1, with biases 27 x 27 x 128 13 x 13 x 256 3x3 conv, 512 kernels, stride 2, padding 1, with biases 13 x 13 x 256 7 x 7 x 512 Table 1: Architecture of a CNN Marking scheme: ˆ 0.25 point for number of channels ˆ 0.25 point for height and width ˆ -1 point if the format is wrong, e.g. (c, h, w) instead of (h, w, c) (d) [2 points] Assume we want to add a fully-connected layer at the back of the CNN as described in part (b). However, the output of the resulting network still contains a large number of parameters. State what we should do if we want to reduce the number of parameters and summarize the output. Hint: We want to turn N × N × 512 output to 1 × 1 × 512. Remark: Convolution layer is not an acceptable answer. Answer: Max pooling or average pooling. Marking scheme: ˆ 2 points for giving the correct answer. ˆ -0.5 point if type of pooling wasn’t specified. ˆ -0.5 point if pooling is not mentioned. (e.g. taking average) As pooling is a standard component of neural networks. ˆ 1 point for resize. (e) [2.5 points] Calculate the total number of parameters of the model? (i.e. for all the layers described in Table 1). Answer: The total number of parameters =(7 × 7 × 3 × 64 + 64)+ (3 × 3 × 64 × 128 + 128)+ (3 × 3 × 128 × 256 + 256)+ (3 × 3 × 256 × 512 + 512) =9472 + 73856 + 295168 + 1180160 =1558656 Marking scheme: ˆ 2.5 point for giving the correct answer. (f) Assume the model should classify images into 1000 distinct classes by (i) [2.5 points] Appending the network shown in Table 2 to the end of the CNN network (i.e. the one described in Table 1) appended with the structure that you added in part (c). Layer (Size, Specifications) Input Shape Output Shape Flatten 1 x 1 x 512 Fully-connected (Flatten before feed-in) Table 2: Classification network Calculate the total number of parameters in the whole network (i.e. the layers de- scribed in Table 1, your answer in part (d), and the layers described in Table 2). Answer: The total number of parameters =1558656 + (512 × 1000 + 1000) =1558656 + 513000 =2071656 Marking scheme: ˆ 2.5 point for giving the correct answer. (ii) [2.5 points] Now, suppose we use the MLP described in Table 3 to classify the images instead of CNN. Layer (Size, Specifications) Input Shape Output Shape Flatten 224 x 224 x 3 Fully-connected (Flatten before feed-in) Table 3: Classification using MLP Calculate the total number of parameters in the MLP (i.e. only those described in Table 3). Answer: The total number of parameters =(150528 × 1000 + 1000) =150529000 Marking scheme: ˆ 2.5 point for giving the correct answer. (g) [2 points] State what you observe in part (f). Also, state the property of CNN, which leads to this difference. Answer: The number of parameters of CNN is significantly less than the number of parameters of pure MLP. This is because CNN uses shared parameters (kernels) to process the input instead of using 1 parameter for each individual input. Marking scheme: ˆ 1 point for observations. If the observation is wrong, explanation is ignored. ˆ 1 point for explanation if mentioned “large scale extraction” but “feature extraction” only gains 0.5 point. ˆ 1 point for “sparse connection” ˆ 0 point for reducing output size. As fully-connected layer could also reduce the output size for the following fc layer, and yet the number of parameter is even more bloated.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'hard', '', '', ''), ('COMP2211-2022-spring-final-part-b', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [13 points] Python Programming: Convolutional Neural Network Suppose the following CNN model learns human emotions from RGB images of faces and classifies into 12 emotion categories. Assumptions: All layers have no padding. Both two convolutional layers have 3 × 3 kernels. The model is trained with Adam optimizer in default learning rate, and the loss function is categorical cross-entropy. You don’t need to specify extra metrics like accuracy. (a) [9 points] According to all the information given above, write the Python codes to con- struct and compile the model using Keras library. The following import statements are provided for you. Also, a reference of useful Keras classes and functions are given in the appendix. from keras.models import Sequential from keras.layers import Conv2D, MaxPooling2D from keras.layers import Dense, Flatten (b) [4 points] Now instead of classification, a student wants to predict a single continuous real value from an input picture: how positive (or negative) the emotion is, ranging from 0 to 1. Is CNN in general able to do that? If yes, derive a model to complete such a task from part (a) model. Point out which parts you will change when building and compiling the model (if any) and explain how you will change it. (You don’t have to write codes, but state clearly your ideas). If no, also explain why. Appendix: Below are some Keras documentation for your reference. Some irrelevant parameters are omitted for conciseness. Sequential class tf.keras.Sequential(layers=None, name=None) Sequential groups a linear stack of layers into a tf.keras.Model ˆ add Method Sequential.add(layer) – layer: layer instance ˆ compile Method Model.compile(optimizer="rmsprop", loss=None) – optimizer: String (name of optimizer) or optimizer instance. – loss: Loss function. Conv2D class tf.keras.layers.Conv2D( filters, kernel_size, strides=(1, 1), padding="valid", activation=None, ) 2D convolution layer (e.g. spatial convolution over images). When using this layer as the first layer in a model, provide the keyword argument input shape (tuple of integers, does not include the sample axis), e.g. input shape=(128,128,3) for 128x128 RGB pictures in data format="channels last". ˆ filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). ˆ kernel size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. ˆ strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution along the height and width. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any dilation rate value != 1. ˆ padding: one of “valid” or “same” (case-insensitive). “valid” means no padding. “same” results in padding with zeros evenly to the left/right or up/down of the input. When padding="same" and strides=1, the output has the same size as the input. ˆ activation: Activation function to use. If you don’t specify anything, no activation is applied. MaxPooling2D class tf.keras.layers.MaxPooling2D( pool_size=(2, 2), strides=None, padding="valid", ) Max pooling operation for 2D spatial data. ˆ pool size: integer or tuple of 2 integers, window size over which to take the maximum. (2, 2) will take the max value over a 2x2 pooling window. If only one integer is specified, the same window length will be used for both dimensions. ˆ strides: Integer, tuple of 2 integers, or None. Strides values. Specifies how far the pooling window moves for each pooling step. If None, it will default to pool size. ˆ padding: One of “valid” or “same” (case-insensitive). “valid” means no padding. “same” results in padding evenly to the left/right or up/down of the input such that output has the same height/width dimension as the input. Flatten class tf.keras.layers.Flatten() Flattens the input. Dense class tf.keras.layers.Dense( units, activation=None, ) Regular densely-connected NN layer. ˆ units: Positive integer, dimensionality of the output space. ˆ activation: Activation function to use. If you don’t specify anything, no activation is applied. Common activation functions (in shorthand strings): “relu”, “sigmoid”, “softmax” Common loss functions (in shorthand strings): “categorical crossentropy”, “sparse categorical crossentropy”, “mean squared error” (same as “MSE”), “mean absolute error” (same as “MAE”) Common optimizer (in shorthand strings): “adam”', 13, 5, NULL::jsonb, NULL, NULL, 'Problem 2 [13 points] Python Programming: Convolutional Neural Network Suppose the following CNN model learns human emotions from RGB images of faces and classifies into 12 emotion categories. Assumptions: All layers have no padding. Both two convolutional layers have 3 × 3 kernels. The model is trained with Adam optimizer in default learning rate, and the loss function is categorical cross-entropy. You don’t need to specify extra metrics like accuracy. (a) [9 points] According to all the information given above, write the Python codes to con- struct and compile the model using Keras library. The following import statements are provided for you. Also, a reference of useful Keras classes and functions are given in the appendix. from keras.models import Sequential from keras.layers import Conv2D, MaxPooling2D from keras.layers import Dense, Flatten Answer: model = Sequential() model.add(Conv2D(filters=32, kernel_size=(3, 3), activation=''relu'', strides=(3,3), input_shape=(32, 32, 1))) model.add(Conv2D(filters=64, kernel_size=(3, 3), activation=''relu'', stride=(2,2))) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Flatten()) model.add(Dense(units=12, activation=''softmax'')) model.compile(optimizer=''adam'', loss=''categorical_crossentropy'') Marking scheme: ˆ Apply the following rules until the score is 0. – -1 for each of Pooling, Flatten, Dense, Compile lines if it is missing or has any wrong parameter – -1 for each of the wrong or missing parameters in a Conv2D layer. This in- cludes any parameter that is good by default but you break it by setting another absolutely wrong value. – -1 if the input shape is not specified in any manner – -1 for syntax errors like messed up brackets, the layers are not actually added to a model, etc. Small typos on keywords, etc. are not penalized (b) [4 points] Now instead of classification, a student wants to predict a single continuous real value from an input picture: how positive (or negative) the emotion is, ranging from 0 to 1. Is CNN in general able to do that? If yes, derive a model to complete such a task from part (a) model. Point out which parts you will change when building and compiling the model (if any) and explain how you will change it. (You don’t have to write codes, but state clearly your ideas). If no, also explain why. Answer: Yes, CNN is able to do that in general, and the following parts need to be changed. ˆ Change the Dense layer to units=1. ˆ Change the Dense layer activation to any other than softmax, e.g., relu or sigmoid. ˆ Change the loss to any regression loss, e.g., MSE, MAE. Marking scheme: ˆ 1 point for stating CNN is able to do that in general. ˆ 1 point for changing the output Dense unit to 1 ˆ 1 point for changing output activation to anything not related to probability and range containing [0,1] (sigmoid, linear, relu, etc.) ˆ 1 point for changing the loss function to any real value comparison (subtraction), e.g. Mean Squared Error, Mean Absolute Error, or other similar self-defined losses. ˆ -0.5 point for each change if the student doesn’t specify to what it changes or changes it to a wrong value. Appendix: Below are some Keras documentation for your reference. Some irrelevant parameters are omitted for conciseness. Sequential class tf.keras.Sequential(layers=None, name=None) Sequential groups a linear stack of layers into a tf.keras.Model ˆ add Method Sequential.add(layer) – layer: layer instance ˆ compile Method Model.compile(optimizer="rmsprop", loss=None) – optimizer: String (name of optimizer) or optimizer instance. – loss: Loss function. Conv2D class tf.keras.layers.Conv2D( filters, kernel_size, strides=(1, 1), padding="valid", activation=None, ) 2D convolution layer (e.g. spatial convolution over images). When using this layer as the first layer in a model, provide the keyword argument input shape (tuple of integers, does not include the sample axis), e.g. input shape=(128,128,3) for 128x128 RGB pictures in data format="channels last". ˆ filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). ˆ kernel size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. ˆ strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution along the height and width. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any dilation rate value != 1. ˆ padding: one of “valid” or “same” (case-insensitive). “valid” means no padding. “same” results in padding with zeros evenly to the left/right or up/down of the input. When padding="same" and strides=1, the output has the same size as the input. ˆ activation: Activation function to use. If you don’t specify anything, no activation is applied. MaxPooling2D class tf.keras.layers.MaxPooling2D( pool_size=(2, 2), strides=None, padding="valid", ) Max pooling operation for 2D spatial data. ˆ pool size: integer or tuple of 2 integers, window size over which to take the maximum. (2, 2) will take the max value over a 2x2 pooling window. If only one integer is specified, the same window length will be used for both dimensions. ˆ strides: Integer, tuple of 2 integers, or None. Strides values. Specifies how far the pooling window moves for each pooling step. If None, it will default to pool size. ˆ padding: One of “valid” or “same” (case-insensitive). “valid” means no padding. “same” results in padding evenly to the left/right or up/down of the input such that output has the same height/width dimension as the input. Flatten class tf.keras.layers.Flatten() Flattens the input. Dense class tf.keras.layers.Dense( units, activation=None, ) Regular densely-connected NN layer. ˆ units: Positive integer, dimensionality of the output space. ˆ activation: Activation function to use. If you don’t specify anything, no activation is applied. Common activation functions (in shorthand strings): “relu”, “sigmoid”, “softmax” Common loss functions (in shorthand strings): “categorical crossentropy”, “sparse categorical crossentropy”, “mean squared error” (same as “MSE”), “mean absolute error” (same as “MAE”) Common optimizer (in shorthand strings): “adam”', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'medium', '', '', ''), ('COMP2211-2022-spring-final-part-b', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [8 points] Minimax and Alpha-Beta Pruning Two players, MAX and MIN, are playing a game that can be represented by a tree, as shown below. (a) [3.5 points] Complete the following table by estimating the minimax value of each non- terminal node. A B C D E F G (b) [0.5 point] State the proper move of the maximizer by writing down one of the root’s outgoing edges (i.e. E-a or E-b). Note: The root node of the given tree is A. (c) [1.5 points] State whether minimax-based AI will choose to make a move which will result in a slower victory. Explain your answer. (d) [2.5 points] Suppose we now apply alpha-beta pruning on the game tree. Indicate the edge(s) that would be pruned (eliminated from consideration) by writing down the edge labels (i.e. E-a, E-b, E-c, . . ., E-n). You may assume that the branches are explored from left to right.', 8, 9, NULL::jsonb, NULL, NULL, 'Problem 3 [8 points] Minimax and Alpha-Beta Pruning Two players, MAX and MIN, are playing a game that can be represented by a tree, as shown below. (a) [3.5 points] Complete the following table by estimating the minimax value of each non- terminal node. Answer: A B C -1 D E F -1 G Marking scheme: ˆ 0.5 point for each correct answer. 3.5 points in total. (b) [0.5 point] State the proper move of the maximizer by writing down one of the root’s outgoing edges (i.e. E-a or E-b). Note: The root node of the given tree is A. Answer: E-a Marking scheme: ˆ 0.5 point for the correct answer. (c) [1.5 points] State whether minimax-based AI will choose to make a move which will result in a slower victory. Explain your answer. Answer: Yes, minimax-based AI may choose to make a move, resulting in a slower victory. Since we may have two moves with the same maximum minimax value, it picks the one in slower victory. Marking scheme: ˆ 1 point for stating minimax-based AI will choose to make a move which will result in a slower victory. ˆ 0.5 point for giving the proper explanation. (d) [2.5 points] Suppose we now apply alpha-beta pruning on the game tree. Indicate the edge(s) that would be pruned (eliminated from consideration) by writing down the edge labels (i.e. E-a, E-b, E-c, . . ., E-n). You may assume that the branches are explored from left to right. Answer: E-j and E-f would be pruned. Marking scheme: ˆ If E-j appears: +1 point. ˆ If E-f appears: +1.5 point. ˆ If E-m or E-n appears: +0 point. ˆ If other label appears: then d) become 0 points, no matter E-j and E-f appears or not.', ARRAY['Search and Games']::TEXT[], 'Search and Games', 'Search and Games', ARRAY['Search and Games']::TEXT[], ARRAY['tree_search', 'pruning', 'manual_tracing']::TEXT[], 'easy', '', '', ''), ('COMP2211-2022-spring-final-part-b', '4', NULL, 4, 'long_question', 'short_answer', 'Problem 4 [7 points] Ethics of Artificial Intelligence This question consists of five sub-questions, four of them are multiple-choice questions, and one is a short question. Choose the BEST ANSWER among the given choices for each multiple-choice question and put your answer in the given table, while for the short question, answer it in a few sentences. (a) [1 point] Ethics in artificial intelligence is (A) Something that is not an issue. (B) Something that somebody else will do in the future. (C) Something that we need to apply today. (D) Something that is entirely solved in current AI systems. (b) [1 point] One approach that helps developers avoid unintentionally creating bias in AI systems is (A) Using a wide variety of appropriately diverse data for training. (B) Using highly specific training data from a narrow range. (C) Not using any training data. (D) None of the above (c) [1 point] What are some of the ethical concerns around artificial intelligence? I. Racial, gender or other types of bias. II. Loss of jobs due to AI replacing workers performing repetitive tasks. III. Concern about the trustworthiness of decision-making supported by AI systems. IV. Privacy, for example, as human faces are photographed and recognized in public spaces. (A) I and II only (B) I, II, and IV only (C) All of the above (D) None of the above (d) [1 point] What is a significantly way in which developers of AI systems can guard against introducing bias? (A) Using only examples from their own environment as training data. (B) Providing effective training data and performing regular tests and audits. (C) Using less varied AI systems and datasets. (D) Using government approved algorithms. Question Answer (a) (b) (c) (d) (e) [3 points] State THREE ethical issues involved with the introduction of autonomous vehicles.', 7, 10, NULL::jsonb, NULL, NULL, 'Problem 4 [7 points] Ethics of Artificial Intelligence This question consists of five sub-questions, four of them are multiple-choice questions, and one is a short question. Choose the BEST ANSWER among the given choices for each multiple-choice question and put your answer in the given table, while for the short question, answer it in a few sentences. (a) [1 point] Ethics in artificial intelligence is (A) Something that is not an issue. (B) Something that somebody else will do in the future. (C) Something that we need to apply today. (D) Something that is entirely solved in current AI systems. (b) [1 point] One approach that helps developers avoid unintentionally creating bias in AI systems is (A) Using a wide variety of appropriately diverse data for training. (B) Using highly specific training data from a narrow range. (C) Not using any training data. (D) None of the above (c) [1 point] What are some of the ethical concerns around artificial intelligence? I. Racial, gender or other types of bias. II. Loss of jobs due to AI replacing workers performing repetitive tasks. III. Concern about the trustworthiness of decision-making supported by AI systems. IV. Privacy, for example, as human faces are photographed and recognized in public spaces. (A) I and II only (B) I, II, and IV only (C) All of the above (D) None of the above (d) [1 point] What is a significantly way in which developers of AI systems can guard against introducing bias? (A) Using only examples from their own environment as training data. (B) Providing effective training data and performing regular tests and audits. (C) Using less varied AI systems and datasets. (D) Using government approved algorithms. Question Answer (a) C (b) A (c) C (d) B Marking scheme: ˆ 1 point for each correct answer. 4 points in total. (e) [3 points] State THREE ethical issues involved with the introduction of autonomous vehicles. Answer: ˆ Who is to blame in an accident? ˆ In an emergency situation who should the car prioritize? ˆ Increase in use of cars is bad for the environment. ˆ Cost of the cars. Marking scheme: ˆ 1 point for giving each correct ethical issue. 3 points in total.', ARRAY['Ethics of AI']::TEXT[], 'Ethics of AI', 'Ethics of AI', ARRAY['Ethics of AI']::TEXT[], ARRAY['concept_explanation', 'argumentation', 'comparison']::TEXT[], 'easy', '', '', ''), ('COMP2211-2023-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions Indicate whether the following statements are true or false by circling T or F. You get 1 point for each correct answer. (a) T F Na¨ıve Bayes classifier is a probabilistic algorithm that computes the probability of an instance belonging to each class and selects the class with the highest probability as the output. (b) T F Na¨ıve Bayes classifier can be used for multi-class classification task. (c) T F K-Nearest Neighbors is a supervised learning and parametric algorithm that can be used to solve both classification and regression problems. (d) T F In K-Nearest Neighbors algorithm, the value of K should always be odd to avoid ties. (e) T F In D-fold cross validation, an increase of D will result in a longer time required to cross- validate the result. (f) T F After centroids initialization, K-Means Clustering is sensitive to the order in which the data points are processed, meaning that changing the order of the input data points may lead to different clustering results. (g) T F K-Median Clustering is robust to the presence of outliers and noise in the dataset, as it uses the median of the data points as the center of each cluster. Note: The median is the middle number in a sorted, ascending or descending list of numbers. (h) T F A perceptron with different initialization of weights and bias may result in different decision boundaries. (i) T F For perceptron, larger learning rates always lead to faster convergence. (j) T F Multilayer Perceptron with more layers are more expressive than Single Layer Perceptron regardless of the activation function is linear or not.', 10, 3, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions Indicate whether the following statements are true or false by circling T or F. You get 1 point for each correct answer. (a) T F Na¨ıve Bayes classifier is a probabilistic algorithm that computes the probability of an instance belonging to each class and selects the class with the highest probability as the output. (b) T F Na¨ıve Bayes classifier can be used for multi-class classification task. (c) T F K-Nearest Neighbors is a supervised learning and parametric algorithm that can be used to solve both classification and regression problems. (d) T F In K-Nearest Neighbors algorithm, the value of K should always be odd to avoid ties. (e) T F In D-fold cross validation, an increase of D will result in a longer time required to cross- validate the result. (f) T F After centroids initialization, K-Means Clustering is sensitive to the order in which the data points are processed, meaning that changing the order of the input data points may lead to different clustering results. (g) T F K-Median Clustering is robust to the presence of outliers and noise in the dataset, as it uses the median of the data points as the center of each cluster. Note: The median is the middle number in a sorted, ascending or descending list of numbers. (h) T F A perceptron with different initialization of weights and bias may result in different decision boundaries. (i) T F For perceptron, larger learning rates always lead to faster convergence. (j) T F Multilayer Perceptron with more layers are more expressive than Single Layer Perceptron regardless of the activation function is linear or not.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2023-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [21 points] Python Fundamentals (a) [9 points] Consider the following Numpy arrays: import numpy as np # np.arange(stop) # return an array of evenly spaced values within the half-open interval [0,stop), # the default step size is 1. A = np.arange(10) B = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]]) Write the output for each of the following Python statements. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) print(A[1:5]) (ii) print(A[1:5:2]) (iii) print(A[5::-2]) (iv) print(B[:2,1:]) (v) print(B[A[:1]]) (vi) print(B[A[1:]]) (vii) A[A%3 == 0] = 100 print(A) (viii) # np.mean(a, axis) # return the average of the array elements over the specified axis. print(np.mean(B, axis=-1)) (ix) # np.ndarray.transpose(*axis) # return a view of the array with axes transposed. print(B.transpose((1,0))) (b) [6 points] Write the output for the following Python code segments. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) import numpy as np A = np.array([1,2]) B = np.array([[1,2], [2,4], [3,6], [4,8]]) print(B/A) (ii) import numpy as np A = np.array([[0, 0, 0], [1, 2, 3], [4, 5, 6], [7, 8, 9]]) B = np.array([10, 11, 12, 13]) print(A + B) (iii) import numpy as np A = np.array([0, 10, 20, 30]) B = np.array([1, 2, 3]) # np.newaxis # increase the dimension of an array by adding new axis print(A[:, np.newaxis] + B) (c) [6 points] The distance function between two points x and y in the Poincare ball model can be calculated by the following formula: arccosh  1 + 2 ||x −y||2 (1 −||x||2)(1 −||y||2)  = arccosh  1 + 2 Pn i=1(xi −yi)2 (1 −Pn i=1 x2 i )(1 −Pn i=1 y2 i )  For example, if x = (0.0, 0.0), y = (0.0, 0.1), their Poincare distance is arccosh  1 + 2 (0.0 −0.0)2 + (0.0 −0.1)2 (1 −(0.02 + 0.02))(1 −(0.02 + 0.12))  ≈0.2006707 Given the following NumPy arrays, X and Y, where each 1-D array represents a data point: X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.2]]) Y = np.array([[0, 0.1], [0.2, 0.3]]) Compute the Poincare distance between each data point in X and each data point in Y with a one-line Python expression, such that the result of the expression is [[0.2006707 0.75504766] [0.2027011 0.47971794] [0.46441642 0.22308802]] Note: ˆ An expression is a combination of values, variables, operators, and calls to functions. Your expression should work with any numbers of data points in X and Y and any number of values in the data points. ˆ You can assume that the number of attribute values in each data point are the same for both X and Y. ˆ There must be no explicit loops in your expression. You may find the following attribute or functions useful for this question. ˆ numpy.expand_dims(a, axis) Insert a new axis to a that will appear at the axis position in the expanded array shape. ˆ numpy.square(x) Return the element-wise square of the input x. ˆ numpy.sum(a, axis) Return the sum of array a’s elements over a given axis. ˆ a.T The transposed array of a. ˆ numpy.matmul(a, b) Return the matrix product of a and b. ˆ numpy.arccosh(x) Return the element-wise arccosh of the input x. Write your one-line Python expression below.', 21, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [21 points] Python Fundamentals (a) [9 points] Consider the following Numpy arrays: import numpy as np # np.arange(stop) # return an array of evenly spaced values within the half-open interval [0,stop), # the default step size is 1. A = np.arange(10) B = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]]) Write the output for each of the following Python statements. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) print(A[1:5]) Answer: [1 2 3 4] (ii) print(A[1:5:2]) Answer: [1 3] (iii) print(A[5::-2]) Answer: [5 3 1] (iv) print(B[:2,1:]) Answer: [[10 15] [25 30]] (v) print(B[A[:1]]) Answer: [[5 10 15]] (vi) print(B[A[1:]]) Answer: Error (vii) A[A%3 == 0] = 100 print(A) Answer: [100 1 2 100 4 5 100 7 8 100] (viii) # np.mean(a, axis) # return the average of the array elements over the specified axis. print(np.mean(B, axis=-1)) Answer: [10 25 40] (ix) # np.ndarray.transpose(*axis) # return a view of the array with axes transposed. print(B.transpose((1,0))) Answer: [[5 20 35] [10 25 40] [15 30 45]] Marking Scheme: ˆ 1 point for each sub-question. No partial point. The brackets must be correct in order to get the points. 9 points in total. (b) [6 points] Write the output for the following Python code segments. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) import numpy as np A = np.array([1,2]) B = np.array([[1,2], [2,4], [3,6], [4,8]]) print(B/A) Answer: [[1 1] [2 2] [3 3] [4 4]] (ii) import numpy as np A = np.array([[0, 0, 0], [1, 2, 3], [4, 5, 6], [7, 8, 9]]) B = np.array([10, 11, 12, 13]) print(A + B) Answer: Error (iii) import numpy as np A = np.array([0, 10, 20, 30]) B = np.array([1, 2, 3]) # np.newaxis # increase the dimension of an array by adding new axis print(A[:, np.newaxis] + B) Answer: [[1 2 3] [11 12 13] [21 22 23] [31 32 33]] Marking Scheme: ˆ 2 points for each sub-question. No partial point. The bracket must be correct in order to get the points. 6 points in total. (c) [6 points] The distance function between two points x and y in the Poincare ball model can be calculated by the following formula: arccosh  1 + 2 ||x −y||2 (1 −||x||2)(1 −||y||2)  = arccosh  1 + 2 Pn i=1(xi −yi)2 (1 −Pn i=1 x2 i )(1 −Pn i=1 y2 i )  For example, if x = (0.0, 0.0), y = (0.0, 0.1), their Poincare distance is arccosh  1 + 2 (0.0 −0.0)2 + (0.0 −0.1)2 (1 −(0.02 + 0.02))(1 −(0.02 + 0.12))  ≈0.2006707 Given the following NumPy arrays, X and Y, where each 1-D array represents a data point: X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.2]]) Y = np.array([[0, 0.1], [0.2, 0.3]]) Compute the Poincare distance between each data point in X and each data point in Y with a one-line Python expression, such that the result of the expression is [[0.2006707 0.75504766] [0.2027011 0.47971794] [0.46441642 0.22308802]] Note: ˆ An expression is a combination of values, variables, operators, and calls to functions. Your expression should work with any numbers of data points in X and Y and any number of values in the data points. ˆ You can assume that the number of attribute values in each data point are the same for both X and Y. ˆ There must be no explicit loops in your expression. You may find the following attribute or functions useful for this question. ˆ numpy.expand_dims(a, axis) Insert a new axis to a that will appear at the axis position in the expanded array shape. ˆ numpy.square(x) Return the element-wise square of the input x. ˆ numpy.sum(a, axis) Return the sum of array a’s elements over a given axis. ˆ a.T The transposed array of a. ˆ numpy.matmul(a, b) Return the matrix product of a and b. ˆ numpy.arccosh(x) Return the element-wise arccosh of the input x. Write your one-line Python expression below. Answer: print(np.arccosh(1 + 2 * np.sum((np.expand_dims(X, axis=1) - Y) ** 2, axis=2) / np.matmul(np.expand_dims(1 - np.sum(X ** 2, axis=1), axis=1), np.expand_dims(1 - np.sum(Y ** 2, axis=1), axis=1).T))) Marking Scheme: ˆ 1.5 points: correct usage of np.expand dims(), 0.5 points each. The axis must be correct in order to get the points. Can be replaced by np.newaxis or other equivalent functions. ˆ 1.5 points: correct usage of np.sum(), 0.5 points each. The axis must be correct in order to get the points. ˆ 1.5 points: Ccorrect usage of np.square(), 0.5 points each. Can be replaced by **2. ˆ 0.5 points: correct usage of np.matmul(). The second argument should have the transpose (np.transpose() or T.). But if the second no.expand dims() in the denom- inator is expanded in axis = 0, the transpose should not occur. Can be replaced by np.dot() or @. ˆ 0.5 points: correct usage of np.arccosh(). ˆ 0.5 points: correct basic formula as arccosh(1 + 2*(nominator/denominator)). Gen- erally the students can get this 0.5 points if they write their answers.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''), ('COMP2211-2023-spring-midterm', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [8 points] Na¨ıve Bayes Classifier Given the following training dataset Humidity Wind Temperature Play Tennis Day 1 High Weak Mild Yes Day 2 ? Weak Cool Yes Day 3 Normal Strong Cool No Day 4 Normal Strong ? Yes Day 5 High Weak Hot No Suppose we want to classify whether we will play tennis on Day 6 with the following attributes: Humidity=High, Wind=Weak, Temperature=? using Na¨ıve Bayes Classifier with the given training dataset. Unfortunately, some of the training data is missing, which is denoted as “?”. Can Na¨ıve Bayes Classifier handle this dataset with missing data? i.e., Can you still yield a classifica- tion output of whether you will play tennis on Day 6? If not, explain why. Otherwise, show how you will approach this problem. In your answer, you should clearly mention why your explanation is valid with reference to the property of the Na¨ıve Bayes algorithm.', 8, 9, NULL::jsonb, NULL, NULL, 'Problem 3 [8 points] Na¨ıve Bayes Classifier Given the following training dataset Humidity Wind Temperature Play Tennis Day 1 High Weak Mild Yes Day 2 ? Weak Cool Yes Day 3 Normal Strong Cool No Day 4 Normal Strong ? Yes Day 5 High Weak Hot No Suppose we want to classify whether we will play tennis on Day 6 with the following attributes: Humidity=High, Wind=Weak, Temperature=? using Na¨ıve Bayes Classifier with the given training dataset. Unfortunately, some of the training data is missing, which is denoted as “?”. Can Na¨ıve Bayes Classifier handle this dataset with missing data? i.e., Can you still yield a classifica- tion output of whether you will play tennis on Day 6? If not, explain why. Otherwise, show how you will approach this problem. In your answer, you should clearly mention why your explanation is valid with reference to the property of the Na¨ıve Bayes algorithm. Answer: Na¨ıve Bayes can handle a dataset with missing value. Attributes are handled separately by the algorithm at both model construction time and prediction time. Therefore, the miss- ing attributes can simply be ignored while preparing the model, and also ignored when a probability is calculated for a class value. Marking Scheme: ˆ Case 1: Student’s answer is a clear “Yes”, or suggests that the Na¨ıve Bayes can be used in this scenario. – 2 points for indicating/suggesting that Na¨ıve Bayes can be used in this scenario. – 3 points for mentioning the property of the Na¨ıve Bayes, where attributes are handled separately and/or independently. – 3 points for mentioning the data entry with the missing value in the table can simply be ignored. However, answers that suggest ignoring the data for the entire day (row), or the data for the entire attribute (column) are not acceptable. Answer should also include that the attribute “Temperature” for Day 6 can be ignored for when calculating the classification result. ˆ Case 2: Student’s answer is a clear “No”. – 0 point, and partial credit is NOT given for the subsequent explanation. Because, whatever explanation is for the wrong claim.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'easy', '', '', ''), ('COMP2211-2023-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [14 points] K-Nearest Neighbors Suppose you are building a K-Nearest Neighbors classifier that predicts user preference on movie genre based on their rating history (a 5-point scale). User Movie Ratings Preferred Genre Movie1 Movie2 Movie3 Alice Romance Bob Romance Charlie Action Table 1: Training Dataset User Movie Ratings Preferred Genre Movie1 Movie2 Movie3 David ? Table 2: Test Dataset (a) [8 points] Based on the movie ratings in Table 1 and Table 2, calculate the Cosine and Euclidean distance between ratings of each training data and test data (round to the third decimal place). Fill in the distance columns in the following table, and determine the class of David for each distance when K=1. You can find the following approxima- tions and equations helpful for this question. Approximated values: √ 5 ≈2.236 √ 12 ≈3.464 √ 14 ≈3.742 √ √ 22 = √ 242 ≈15.556 √ √ 66 = √ 1452 ≈38.105 Cosine distance formula: cosθ = Pn i=1 Xtrain i × Xtest i qPn i=1(Xtrain i )2  pPn i=1(Xtest i )2  Cosine Distance = 1 −cosθ where Xtrain i and Xtest i are the feature value of training data and test data, respectively, and n is the number of features. Euclidean distance formula: Euclidean Distance = v u u t n X i=1 (Xtrain i −Xtest i )2 where Xtrain i and Xtest i are the feature value of training data and test data, respectively, and n is the number of features. User Movie Ratings Movie1 Movie2 Movie3 Preferred Genre Cosine Distance Euclidean Distance Alice Romance Bob Romance Charlie Action David ? Class for cosine distance: Class for Euclidean distance: (b) [6 points] Let’s say that Alice tends to rate movies highly, while David tends to rate them relatively poorly. Given this scenario, which distance metric is more appropriate? Also, describe why it makes the classifier better.', 14, 10, NULL::jsonb, NULL, NULL, 'Problem 4 [14 points] K-Nearest Neighbors Suppose you are building a K-Nearest Neighbors classifier that predicts user preference on movie genre based on their rating history (a 5-point scale). User Movie Ratings Preferred Genre Movie1 Movie2 Movie3 Alice Romance Bob Romance Charlie Action Table 1: Training Dataset User Movie Ratings Preferred Genre Movie1 Movie2 Movie3 David ? Table 2: Test Dataset (a) [8 points] Based on the movie ratings in Table 1 and Table 2, calculate the Cosine and Euclidean distance between ratings of each training data and test data (round to the third decimal place). Fill in the distance columns in the following table, and determine the class of David for each distance when K=1. You can find the following approxima- tions and equations helpful for this question. Approximated values: √ 5 ≈2.236 √ 12 ≈3.464 √ 14 ≈3.742 √ √ 22 = √ 242 ≈15.556 √ √ 66 = √ 1452 ≈38.105 Cosine distance formula: cosθ = Pn i=1 Xtrain i × Xtest i qPn i=1(Xtrain i )2  pPn i=1(Xtest i )2  Cosine Distance = 1 −cosθ where Xtrain i and Xtest i are the feature value of training data and test data, respectively, and n is the number of features. Euclidean distance formula: Euclidean Distance = v u u t n X i=1 (Xtrain i −Xtest i )2 where Xtrain i and Xtest i are the feature value of training data and test data, respectively, and n is the number of features. User Movie Ratings Movie1 Movie2 Movie3 Preferred Genre Cosine Distance Euclidean Distance Alice Romance Bob Romance Charlie Action David ? Class for cosine distance: Class for Euclidean distance: User Movie Ratings Movie1 Movie2 Movie3 Preferred Genre Cosine Distance Euclidean Distance Alice Romance 0.003 3.464 Bob Romance 0.029 3.741 or 3.742 Charlie Action 0.100 2.236 David ? Class for cosine distance: Romance Class for Euclidean distance: Action Marking Scheme: ˆ 1 point for giving each correct value. 6 points in total. – 0.5 point for each correct value that is not rounded to the 3rd decimal place. – No point for others (e.g., square root values or equations). ˆ 1 point for giving each correct class. 2 points in total. (b) [6 points] Let’s say that Alice tends to rate movies highly, while David tends to rate them relatively poorly. Given this scenario, which distance metric is more appropriate? Also, describe why it makes the classifier better. Answer: In this scenario, the Cosine distance metric performs better, indicating that the ground truth class is “Romance”. The reason for this is that the Cosine distance metric con- siders the direction of the vectors being compared, while the Euclidean distance metric takes both direction and magnitude into account. Since the Cosine distance metric is less sensitive to variations in magnitude, it is better suited for comparing vectors with different scales. Given that this data contains bias in terms of scale, the Cosine distance metric is a better choice in this case. Marking Scheme: ˆ 1 point for stating Cosine distance metric performs better, indicating that the ground truth class is “Romance”. ˆ 2 points for stating Cosine distance metric considers the direction, while Euclidean distance metric takes both direction and magnitude into account. – Note that direction or angle should be mentioned in the state of Cosine dis- tance metric, while magnitude should be mentioned in the state of Euclidean distance metric. ˆ 3 points for stating Cosine distance metric is less sensitive to variations in mag- nitude, and the data contain bias in terms of scale, Cosine distance is a better choice. – 1 point for stating that Cosine distance metric is less sensitive to variations. – 1 point for stating that Cosine distance metric is less sensitive to the biased data which is described in the problem.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2023-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [19 points] K-Means Clustering Consider a 2-dimensional dataset with the following 5 data points: A(1, 4), B(2, 3), C(4, 6), D(5, 7), E(8, 3) Perform K-Means clustering on this dataset with K = 2. Use the following initial centroids for your calculations: Centroid 1: A(1, 4), Centroid 2: D(5, 7) If a tie occurs, assign the points to Centroid 1. All calculations round to two decimal places. (a) [7 points] Calculate the Euclidean distances between each data point and both centroids. Fill in the distances in the table. Then, assign each point to the nearest centroid. Distance A B C D E Centroid 1 Centroid 2 Data points assigned to Centroid 1: Data points assigned to Centroid 2: (b) [7 points] Recalculate the centroids using the mean of the points assigned to each cen- troid. Fill in the values of new centroids after C1 and C2 in the table. Then repeat the process of assigning points and recalculating centroids until convergence. You may not need all the provided table templates. Leave them blank if the algorithm has already converged. Report the final cluster assignments and centroids. Distance A B C D E C1( , ) C2( , ) Data points assigned to Centroid 1: Data points assigned to Centroid 2: Distance A B C D E C1( , ) C2( , ) Data points assigned to Centroid 1: Data points assigned to Centroid 2: Distance A B C D E C1( , ) C2( , ) Data points assigned to Centroid 1: Data points assigned to Centroid 2: (c) [5 points] The choice of the number of clusters K in K-Means clustering can significantly impact the clustering results. Selecting too few or too many clusters can lead to overgen- eralization or overfitting. Why this limitation is critical for K-Means clustering? Given a dataset with an unknown number of clusters, please explain one way to determine a suitable K.', 19, 12, NULL::jsonb, NULL, NULL, 'Problem 5 [19 points] K-Means Clustering Consider a 2-dimensional dataset with the following 5 data points: A(1, 4), B(2, 3), C(4, 6), D(5, 7), E(8, 3) Perform K-Means clustering on this dataset with K = 2. Use the following initial centroids for your calculations: Centroid 1: A(1, 4), Centroid 2: D(5, 7) If a tie occurs, assign the points to Centroid 1. All calculations round to two decimal places. (a) [7 points] Calculate the Euclidean distances between each data point and both centroids. Fill in the distances in the table. Then, assign each point to the nearest centroid. Distance A B C D E Centroid 1 Centroid 2 Data points assigned to Centroid 1: Data points assigned to Centroid 2: Answers: Distance A B C D E Centroid 1 1.41 3.61 5.00 7.07 Centroid 2 5.00 5.00 1.41 5.00 Data points assigned to Centroid 1: A, B Data points assigned to Centroid 2: C, D, E Marking Scheme: ˆ 0.5 point for each correct value. 5 points in total. ˆ 1 point for giving the data points assigned to each centroid. 2 points in total. ˆ -0.5 points for not rounding to two decimal places. (b) [7 points] Recalculate the centroids using the mean of the points assigned to each cen- troid. Fill in the values of new centroids after C1 and C2 in the table. Then repeat the process of assigning points and recalculating centroids until convergence. You may not need all the provided table templates. Leave them blank if the algorithm has already converged. Report the final cluster assignments and centroids. Distance A B C D E C1( , ) C2( , ) Data points assigned to Centroid 1: Data points assigned to Centroid 2: Distance A B C D E C1( , ) C2( , ) Data points assigned to Centroid 1: Data points assigned to Centroid 2: Distance A B C D E C1( , ) C2( , ) Data points assigned to Centroid 1: Data points assigned to Centroid 2: Answer: Distance A B C D E C1(1.5, 3.5) 0.71 0.71 3.54 4.95 6.52 C2(5.67, 5.33) 4.86 4.35 1.80 1.80 3.30 Data points assigned to Centroid 1: A, B Data points assigned to Centroid 2: C, D, E Marking Scheme: ˆ 0.5 points for each correct value. 5 points in total. ˆ 1 point for giving the data points assigned to each centroid. 2 points in total. ˆ -0.5 points for not rounding to two decimal places. ˆ -1 point for filling in/copying more than 1 table template (incorrect convergence statement). (c) [5 points] The choice of the number of clusters K in K-Means clustering can significantly impact the clustering results. Selecting too few or too many clusters can lead to overgen- eralization or overfitting. Why this limitation is critical for K-Means clustering? Given a dataset with an unknown number of clusters, please explain one way to determine a suitable K. Answer: Clustering is an unsupervised learning method. In the scenario where you try to adopt K- Means clustering or any other unsupervised models, you don’t know anything about the shape, the number of clusters, or the distribution of the dataset. Such value k is exactly what you are looking for and will never be given beforehand. One common method to determine the optimal number of clusters is the elbow method, which identifies the point where adding more clusters does not improve the clustering performance significantly. However, this method can be subjective and may not always yield the optimal number of clusters. Other methods, such as silhouette analysis or gap statistics, can also be used to determine the optimal number of clusters. Marking Scheme: ˆ 2 points for stating that K-Means is an unsupervised method and the best value K will never be given beforehand. ˆ 3 points for giving and explaining a way to determine a suitable K. ˆ -1 point for stating why the limitation is critical for K-Means clustering but the answer is minor. ˆ -1 point for paraphrasing the prompt. ˆ -1 point for using only SSE to evaluate the model and use the K at minimum SSE.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''), ('COMP2211-2023-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [12 points] Perceptron Given the following training dataset and test dataset: Training Test x1 -2 -1 x2 -2 -1 T -1 -1 -1 -1 (a) [2 points] Suppose we use the training dataset to train a perceptron. At some point during the training, the weights and bias of the perceptron have not changed after four consecutive updates, but the current epoch is not yet finished. Should we stop the training? Explain why. (b) [2 points] After the training has converged, what is the range of possible accuracy on the test dataset? A. 50% B. 50% ∼75% C. 50% ∼100% D. 25% ∼75% E. 0% ∼100% (c) [2 points] Consider the following implementation of a perceptron: import numpy as np def fit(X, T, W0, b0, eta, max_epochs): W = np.ones(X.shape[0]) * W0 b = b0 for epoch in range(max_epochs): for i in range(T.shape[0]): O = predict(X, W, b) E = T[i] - O W = W + E * X b = b + E return W, b def predict(X, W, b): Z = np.dot(X, W) + b O = (Z <= 0) * 2 - 1 return O The predict function takes three inputs: ˆ A 1-D or 2-D NumPy array, X, storing test example(s). The shape of X is (d,) or (n, d) where n is the number of test examples and d is the number of features; ˆ A 1-D NumPy array, W, storing the weights of the perceptron. The shape of W is (d,) where d is the number features; ˆ A bias value, b, for the perceptron. Based on how the predict function is implemented, what is the activation function f(z) of this perceptron? (d) [6 points] The fit function takes six inputs: ˆ A 2-D NumPy array, X, storing training examples. The shape of X is (m, d) where m is the number of training examples and d is the number of features; ˆ A 1-D NumPy array, T, storing training targets. The shape of T is (m,) where m is the number training examples; ˆ An initial value, W0, for the weights of the perceptron; ˆ An initial value, b0, for the bias of the perceptron; ˆ The learning rate, eta; ˆ The maximum number of training epochs, max epochs. Given these inputs, however, the fit function is not implemented correctly. Identify all the errors by giving the line numbers that cause the errors, and propose ways to fix them. You may find the following attribute or functions useful for this question. ˆ numpy.ones(shape) Return a new array of given shape, shape, filled with ones. ˆ range(stop) Return a sequence of numbers starting from 0, incrementing by 1, and ending at the value stop. ˆ ndarray.shape Tuple of array dimensions. ˆ numpy.dot(a, b) Return the dot product of a and b if a and b are both 1-D arrays. If a is an N-D array and b is a 1-D array, it is a sum product over the last axis of a and b is returned.', 12, 15, NULL::jsonb, NULL, NULL, 'Problem 6 [12 points] Perceptron Given the following training dataset and test dataset: Training Test x1 -2 -1 x2 -2 -1 T -1 -1 -1 -1 (a) [2 points] Suppose we use the training dataset to train a perceptron. At some point during the training, the weights and bias of the perceptron have not changed after four consecutive updates, but the current epoch is not yet finished. Should we stop the train- ing? Explain why. Answer: Yes, the training has converged so we should stop it from wasting time and resources. Marking Scheme: ˆ 1 point for stating we should stop the training. ˆ 1 point for giving correct explanations. (b) [2 points] After the training has converged, what is the range of possible accuracy on the test dataset? A. 50% B. 50% ∼75% C. 50% ∼100% D. 25% ∼75% E. 0% ∼100% Answer: D Marking Scheme: ˆ 2 points for the correct answer. (c) [2 points] Consider the following implementation of a perceptron: import numpy as np def fit(X, T, W0, b0, eta, max_epochs): W = np.ones(X.shape[0]) * W0 b = b0 for epoch in range(max_epochs): for i in range(T.shape[0]): O = predict(X, W, b) E = T[i] - O W = W + E * X b = b + E return W, b def predict(X, W, b): Z = np.dot(X, W) + b O = (Z <= 0) * 2 - 1 return O The predict function takes three inputs: ˆ A 1-D or 2-D NumPy array, X, storing test example(s). The shape of X is (d,) or (n, d) where n is the number of test examples and d is the number of features; ˆ A 1-D NumPy array, W, storing the weights of the perceptron. The shape of W is (d,) where d is the number features; ˆ A bias value, b, for the perceptron. Based on how the predict function is implemented, what is the activation function f(z) of this perceptron? Answer: f(z) =    if z ≤0 −1 otherwise Marking Scheme: ˆ 2 points for giving the correct activation function. (d) [6 points] The fit function takes six inputs: ˆ A 2-D NumPy array, X, storing training examples. The shape of X is (m, d) where m is the number of training examples and d is the number of features; ˆ A 1-D NumPy array, T, storing training targets. The shape of T is (m,) where m is the number training examples; ˆ An initial value, W0, for the weights of the perceptron; ˆ An initial value, b0, for the bias of the perceptron; ˆ The learning rate, eta; ˆ The maximum number of training epochs, max epochs. Given these inputs, however, the fit function is not implemented correctly. Identify all the errors by giving the line numbers that cause the errors, and propose ways to fix them. You may find the following attribute or functions useful for this question. ˆ numpy.ones(shape) Return a new array of given shape, shape, filled with ones. ˆ range(stop) Return a sequence of numbers starting from 0, incrementing by 1, and ending at the value stop. ˆ ndarray.shape Tuple of array dimensions. ˆ numpy.dot(a, b) Return the dot product of a and b if a and b are both 1-D arrays. If a is an N-D array and b is a 1-D array, it is a sum product over the last axis of a and b is returned. Answer: ˆ On line 4, change X.shape[0] into X.shape[1] ˆ On line 8 and 10, change X into X[i] ˆ On line 9, multiply the right-hand side with eta ˆ On line 9, change T[i]-O into O-T[i], or equivalently, on line 10 and 11, change + into -. The resulting fit function is as follows: def fit(X, T, W0, b0, eta, max_epochs): W = np.ones(X.shape[1]) * W0 b = b0 for epoch in range(max_epochs): for i in range(T.shape[0]): O = predict(X[i], W, b) E = eta * (O - T[i]) W = W + E * X[i] b = b + E return W, b Other equivalent solutions also exist. Marking Scheme: ˆ 2 points for each error; 1 point for identifying the error (pointing out only the line number does not count), and 1 point for correctly fixing the error. Partial points are given to incomplete fixes of an error. ˆ For fixes that correctly fixed an error but accidentally introduced other errors, points are partially deducted. No extra points are deducted for purely irrelevant/erroneous modifications of the code. ˆ The final mark is capped at 6 points.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''), ('COMP2211-2023-spring-midterm', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [16 points] Multilayer Perceptron (a) [4 points] Consider the following dataset x1 x2 F(x1,x2) Given the following single layer perceptron, where activation function f is defined as f(z) = ( 1, for z ≥0 0, otherwise Can we classify all the data points correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly explain the reason. (b) [2 points] Consider the following dataset x1 x2 F(x1,x2) Using the same perceptron architecture in part (a), can we classify all the data points correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly explain the reason. (c) [10 points] Consider the same dataset in part (b), given the following multilayer percep- tron, where activation function f is defined as identical with that in part (a), can we classify all the data points correctly using this multilayer perceptron? If yes, give a set of possible parameters. If no, briefly explain the reason. Hint: You don’t need to use gradient descent to calculate the results. You may try to find a set of parameters to make the data points linear separable after the first layer mapping. -------------------- END OF PAPER --------------------', 16, 18, NULL::jsonb, NULL, NULL, 'Problem 7 [16 points] Multilayer Perceptron (a) [4 points] Consider the following dataset x1 x2 F(x1,x2) Given the following single layer perceptron, where activation function f is defined as f(z) = ( 1, for z ≥0 0, otherwise Can we classify all the data points correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly explain the reason. Answer: Yes, the four data points are linearly separable. One possible set of parameters is w1 = 1, w2 = 1, θ = −1.5. Marking Scheme: ˆ 1 point for ‘yes’ answer. ˆ 3 points for correct parameter (no partial points). (b) [2 points] Consider the following dataset Using the same perceptron architecture in part (a), can we classify all the data points correctly using this perceptron? If yes, give a set of possible parameters. If no, briefly x1 x2 F(x1,x2) explain the reason. Answer: No, the 4 data points are not linearly separable, so they can’t be fit with 100% accuracy using single layer perceptron. Marking Scheme: ˆ 1 point for ‘yes’ answer. ˆ 3 points for correct parameters (no partial points). (c) [10 points] Consider the same dataset in part (b), given the following multilayer percep- tron, where activation function f is defined as identical with that in part (a), can we classify all the data points correctly using this multilayer perceptron? If yes, give a set of possible parameters. If no, briefly explain the reason. Hint: You don’t need to use gradient descent to calculate the results. You may try to find a set of parameters to make the data points linear separable after the first layer mapping. Answer: Yes, one possible set of parameters can be obtained as following procedure: we first use two linear functions to seperate the 4 datatpoints, e.g, w1 = −1, w2 = 1, θ1 = −0.5, w3 = −1, w4 = 1, θ2 = 0.5, the related diagram is as followed, (1,1) (0,1) (x1) (x2) (1,0) (0,0) 𝑤1𝑥1 + 𝑤2𝑥2 + 𝜃1 𝑤3𝑥1 + 𝑤4𝑥2 + 𝜃2 After first layer mapping, the result will be x1 x2 y1 = w1x1 + w2x2 + θ1 y2 = w3x1 + w4x2 + θ2 f(y1) f(y2) F(x1, x2) -0.5 0.5 0.5 1.5 -1 -0.5 -0.5 0.5 For last three columns of the table, we can see the datapoints are linear separable after the first layer, where we can set w5 = −1, w6 = 1, θ3 = −0.5 to fulfill the requirement. To sum up, one possible set of parameters are w1 = −1, w2 = 1, θ1 = −0.5, w3 = −1, w4 = 1, θ2 = 0.5, w5 = −1, w6 = 1, θ3 = −0.5 Marking Scheme: ˆ 1 point for ‘yes’ answer. ˆ 6 points if first layer mapping makes all data points linear separable (3 points if the data points are ‘almost’ linear separable, e.g., one set of parameters are correct except that the signs are opposite). ˆ 3 points for correct second layer parameters. -------------------- END OF PAPER --------------------', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-midterm', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [5 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 0.5 point for each correct answer. Question (a) (b) (c) (d) (e) (f) (g) (h) (i) (j) Answer (a) In the following code import numpy as np a = np.array([0, 1, 1, 2, 5, 5, 3]) b = np.array([0, 1, 2, 3, 4, 5]) c = (b == a.reshape(7, 1)) The array c has the shape (7, 1). (b) After executing the following block of code: import numpy as np a = np.array([[1, 2], [3, 4], [5, 6]]) b = np.array([[1, 2, 3], [0, 0, 0], [1, 0, 0]]) c = a.dot(b) The array c is array([[22, 28], [0, 0], [1, 2]]). (c) The Na¨ıve Bayes Classifier operates under the assumption that the presence of a partic- ular feature in a class is independent of the presence of any other feature. (d) In Na¨ıve Bayes, we assume that P(B|e1, e2) = P(B|e1)P(B|e2) where B is our belief and (e1, e2) are evidence. (e) In Na¨ıve Bayes, given P(B = b), P(e1|B = b), and P(e2|B = b) for each possible belief b, we can compute P(B = b′|e1, e2) for any b′. (f) K-Nearest Neighbors Classifier CANNOT handle data with categorical features since it is difficult to find the distance between categorical features. (g) In K-Nearest Neighbors for binary classification, odd values of k are usually preferred. (h) A 6-fold cross validation for K-nearest neighbors algorithm means that for each value of K, we randomly select 1/6 of the training data as the validation set to evaluate the model which is trained by the remaining (5/6) of the training data. (i) The result of the K-Means Clustering DOES NOT depend on the initial centroids. (j) It is possible that after new cluster centroids are computed by the K-Means Clustering Algorithm, a cluster centroid may be associated with an empty cluster (i.e., with zero points in it).', 5, 3, NULL::jsonb, NULL, NULL, 'Problem 1 [5 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 0.5 point for each correct answer. Question (a) (b) (c) (d) (e) (f) (g) (h) (i) (j) Answer (a) In the following code import numpy as np a = np.array([0, 1, 1, 2, 5, 5, 3]) b = np.array([0, 1, 2, 3, 4, 5]) c = (b == a.reshape(7, 1)) The array c has the shape (7, 1). (b) After executing the following block of code: import numpy as np a = np.array([[1, 2], [3, 4], [5, 6]]) b = np.array([[1, 2, 3], [0, 0, 0], [1, 0, 0]]) c = a.dot(b) The array c is array([[22, 28], [0, 0], [1, 2]]). (c) The Na¨ıve Bayes Classifier operates under the assumption that the presence of a partic- ular feature in a class is independent of the presence of any other feature. (d) In Na¨ıve Bayes, we assume that P(B|e1, e2) = P(B|e1)P(B|e2) where B is our belief and (e1, e2) are evidence. (e) In Na¨ıve Bayes, given P(B = b), P(e1|B = b), and P(e2|B = b) for each possible belief b, we can compute P(B = b′|e1, e2) for any b′. (f) K-Nearest Neighbors Classifier CANNOT handle data with categorical features since it is difficult to find the distance between categorical features. (g) In K-Nearest Neighbors for binary classification, odd values of k are usually preferred. (h) A 6-fold cross validation for K-nearest neighbors algorithm means that for each value of K, we randomly select 1/6 of the training data as the validation set to evaluate the model which is trained by the remaining (5/6) of the training data. (i) The result of the K-Means Clustering DOES NOT depend on the initial centroids. (j) It is possible that after new cluster centroids are computed by the K-Means Clustering Algorithm, a cluster centroid may be associated with an empty cluster (i.e., with zero points in it). Question (a) (b) (c) (d) (e) (f) (g) (h) (i) (j) Answer F F T F T F T F F T', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'easy', '', '', ''), ('COMP2211-2024-spring-midterm', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [19 points] Advanced Python for Artificial Intelligence (a) [13 points] Consider the following NumPy arrays: import numpy as np # np.arange(start, stop) # - return an array of evenly spaced values within the half-open interval # [start,stop), the default step size is 1. # np.ones(shape): # - return a new array of given shape, filled with ones. A = np.arange(5,15) B = np.ones((3,2)) C = np.array([[[0,1,2,3], [4,5,6,7]], [[8,9,10,11], [12,13,14,15]], [[16,17,18,19], [20,21,22,23]]]) Suppose the following Python statements are running consecutively. Write the output for each of the following Python statements. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) print(A[2:-2:3]) (ii) # np.ndarray.reshape(shape) # - return an array containing the same data with a new shape. print(B.reshape(3,-1,2)) (iii) Please write a line of Python code to get the same result in part (a)(ii) by using np.expand dims on B. # np.expand_dims(a, axis) # - insert a new axis to a that will appear at the axis position in the # expanded array shape. Return an array that is the view of a with the # number of dimensions increased. (iv) Please write a line of Python code to create the array C by using the functions np.ndarray.reshape and np.arange(). (v) # np.mean(a, axis) # - return a new array containing the average values of a over the specified axis. print(np.mean(C,axis = 2)) (vi) # np.transpose(a, axes) # - return an array with axes transposed in a. print(np.transpose(C, (1, 0, 2))) (vii) # a@b # - return the matrix multiplication of the two arrays a and b. D = A.reshape(2, 5) print(B@D) (viii) # np.dot(a,b) # - return the dot product of the two arrays a and b. If both a and b are # 1-D arrays, it is the inner product of vectors and returns a scalar. # If both a and b are bool arrays, the output is in bool datatype. # np.ndarray.astype(dtype) # - return the copy of the array with a specified dtype. E = A < 12 F = A % 5 == 0 print(np.dot(E, F).astype(int)) (ix) print(np.dot(E, F.astype(int))) (x) print( C / B ) (xi) # np.newaxis # - increase the dimension of an array by adding new axis. print(C / B[..., np.newaxis]) (xii) G = C[0, :, 2:] print(G) (xiii) C[0, 1, 2] = 100 print(G) Scheme: ˆ 1 point for giving the correct answer for each part. 13 points in total. (b) [6 points] In recommendation systems, we often recommend items similar to what the user likes. This is known as content-based filtering. In content-based filtering, an item is represented as a feature vector (or a 1D array). Given the following code which computes the cosine similarity between feature vectors of items in explicit loops: def compute_cosine_similarity_loops(X): num_items, num_features = X.shape # --- BLOCK TO REWRITE --- X_normalized = np.zeros((num_items, num_features)) for i in range(num_items): feature_norm = np.sqrt(np.sum(X[i] ** 2)) X_normalized[i] = X[i] / feature_norm similarities = np.zeros((num_items, num_items)) for i in range(num_items): for j in range(num_items): similarities[i, j] = np.sum(X_normalized[i] * X_normalized[j]) # --- BLOCK TO REWRITE --- return similarities X = np.array([[0, 2], [1, -1], [1, 1]]) print(compute_cosine_similarity_loops(X)) # Output: # [[ 1. -0.70710678 0.70710678] # [-0.70710678 1. 0. ] # [ 0.70710678 0. 1. ]] Rewrite the block of code between the comment lines “# --- BLOCK TO REWRITE --- ” using no explicit loops in the space provided. You may find the following functions useful for this question. ˆ Element-wise square of an array: np.square(A) – A is the input array This is equivalent to A ** 2. ˆ Element-wise square root of an array: np.sqrt(A) – A is the input array ˆ Sum of array elements over a given axis: np.sum(A, axis) – A is the input array – axis is the axis across which the array is summed ˆ Insert a new axis of size 1 to an array: np.expand_dims(A, axis) – A is the input array – axis is the position where the axis is to be inserted If axis is 0, this is equivalent to A[np.newaxis] and A[None]. If axis is 1, this is equivalent to A[:, np.newaxis] and A[:, None]. ˆ Transpose of an array: np.transpose(A) – A is the input array This is equivalent to A.T. ˆ Matrix multiplication: np.matmul(A, B) – A is the left array for matrix multiplication – B is the right array for matrix multiplication This is equivalent to A @ B. More information on matrix multiplication: suppose A.shape[1] == B.shape[0], then C = np.matmul(A, B) means that for each i and j, C[i, j] == np.sum(A[i] * B[:, j]) Write your code in the space below.', 19, 4, NULL::jsonb, NULL, NULL, 'Problem 2 [19 points] Advanced Python for Artificial Intelligence (a) [13 points] Consider the following NumPy arrays: import numpy as np # np.arange(start, stop) # - return an array of evenly spaced values within the half-open interval # [start,stop), the default step size is 1. # np.ones(shape): # - return a new array of given shape, filled with ones. A = np.arange(5,15) B = np.ones((3,2)) C = np.array([[[0,1,2,3], [4,5,6,7]], [[8,9,10,11], [12,13,14,15]], [[16,17,18,19], [20,21,22,23]]]) Suppose the following Python statements are running consecutively. Write the output for each of the following Python statements. If the output is an empty array, write “Empty Array”. If an error occurs, write “Error”. (i) print(A[2:-2:3]) Answer: [7 10] (ii) # np.ndarray.reshape(shape) # - return an array containing the same data with a new shape. print(B.reshape(3,-1,2)) Answer: [[[1 1]] [[1 1]] [[1 1]]] (iii) Please write a line of Python code to get the same result in part (a)(ii) by using np.expand dims on B. # np.expand_dims(a, axis) # - insert a new axis to a that will appear at the axis position in the # expanded array shape. Return an array that is the view of a with the # number of dimensions increased. Answer: np.expand dims(B, 1) (also correct if adding print()) (iv) Please write a line of Python code to create the array C by using the functions np.ndarray.reshape and np.arange(). Answer: np.arange(24).reshape(3, 2, 4) (also correct if adding C = ) (v) # np.mean(a, axis) # - return a new array containing the average values of a over the specified axis. print(np.mean(C,axis = 2)) Answer: [[1.5 5.5] [9.5 13.5] [17.5 21.5]] (vi) # np.transpose(a, axes) # - return an array with axes transposed in a. print(np.transpose(C, (1, 0, 2))) Answer: [[[0 1 2 3] [8 9 10 11] [16 17 18 19]] [[4 5 6 7] [12 13 14 15] [20 21 22 23]]] (vii) # a@b # - return the matrix multiplication of the two arrays a and b. D = A.reshape(2, 5) print(B@D) Answer: [[15 17 19 21 23] [15 17 19 21 23] [15 17 19 21 23]] (viii) # np.dot(a,b) # - return the dot product of the two arrays a and b. If both a and b are # 1-D arrays, it is the inner product of vectors and returns a scalar. # If both a and b are bool arrays, the output is in bool datatype. # np.ndarray.astype(dtype) # - return the copy of the array with a specified dtype. E = A < 12 F = A % 5 == 0 print(np.dot(E, F).astype(int)) Answer: (ix) print(np.dot(E, F.astype(int))) Answer: (x) print( C / B ) Answer: Error (xi) # np.newaxis # - increase the dimension of an array by adding new axis. print(C / B[..., np.newaxis]) Answer: [[[0,1,2,3], [4,5,6,7], [[8,9,10,11], [12,13,14,15]], [[16,17,18,19], [20,21,22,23]]] (xii) G = C[0, :, 2:] print(G) Answer: [[2 3] [6 7]] (xiii) C[0, 1, 2] = 100 print(G) Answer: [[2 3] [100 7]] Scheme: ˆ 1 point for giving the correct answer for each part. 13 points in total. (b) [6 points] In recommendation systems, we often recommend items similar to what the user likes. This is known as content-based filtering. In content-based filtering, an item is represented as a feature vector (or a 1D array). Given the following code which computes the cosine similarity between feature vectors of items in explicit loops: def compute_cosine_similarity_loops(X): num_items, num_features = X.shape # --- BLOCK TO REWRITE --- X_normalized = np.zeros((num_items, num_features)) for i in range(num_items): feature_norm = np.sqrt(np.sum(X[i] ** 2)) X_normalized[i] = X[i] / feature_norm similarities = np.zeros((num_items, num_items)) for i in range(num_items): for j in range(num_items): similarities[i, j] = np.sum(X_normalized[i] * X_normalized[j]) # --- BLOCK TO REWRITE --- return similarities X = np.array([[0, 2], [1, -1], [1, 1]]) print(compute_cosine_similarity_loops(X)) # Output: # [[ 1. -0.70710678 0.70710678] # [-0.70710678 1. 0. ] # [ 0.70710678 0. 1. ]] Rewrite the block of code between the comment lines “# --- BLOCK TO REWRITE --- ” using no explicit loops in the space provided. You may find the following functions useful for this question. ˆ Element-wise square of an array: np.square(A) – A is the input array This is equivalent to A ** 2. ˆ Element-wise square root of an array: np.sqrt(A) – A is the input array ˆ Sum of array elements over a given axis: np.sum(A, axis) – A is the input array – axis is the axis across which the array is summed ˆ Insert a new axis of size 1 to an array: np.expand_dims(A, axis) – A is the input array – axis is the position where the axis is to be inserted If axis is 0, this is equivalent to A[np.newaxis] and A[None]. If axis is 1, this is equivalent to A[:, np.newaxis] and A[:, None]. ˆ Transpose of an array: np.transpose(A) – A is the input array This is equivalent to A.T. ˆ Matrix multiplication: np.matmul(A, B) – A is the left array for matrix multiplication – B is the right array for matrix multiplication This is equivalent to A @ B. More information on matrix multiplication: suppose A.shape[1] == B.shape[0], then C = np.matmul(A, B) means that for each i and j, C[i, j] == np.sum(A[i] * B[:, j]) Write your code in the space below. Answer: X_norm = np.sqrt(np.sum(X ** 2, 1)) # (n,) # 2 points X_normalized = X / X_norm[:, None] # (n, d) # 2 points similarities = X_normalized @ X_normalized.T # (n, n) # 2 points The solution is not unique.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-midterm', '3', NULL, 3, 'long_question', 'coding', 'Problem 3 [16 points] Model Evaluation and Advanced Python Programming In this problem, you need to implement the evaluation metrics for multi-class classifiers. Specifically, you need to implement the confusion matrix, accuracy, precision, recall, and macro F1 score. We provide the related definitions and formulas as follows. (a) [5.5 points] Suppose there is a test dataset consisting of 10 data points, their actual classes are [2, 1, 1, 2, 0, 1, 0, 0, 1, 1], and their predicted classes by a classifier model are [2, 1, 2, 1, 0, 2, 1, 0, 0, 0]. What is the confusion matrix for the prediction results? What are TP, TN, FP, FN for each class? Confusion matrix: a table that summarized actual labels and the predictions of classifi- cation. For multi-class classification, the confusion matrix has the shape (num classes, num classes) that records the number of occurrences between actual labels and the pre- dictions. The classes are listed in the same order in the rows as in the columns, therefore the correctly classified elements are located on the main diagonal. For each class i, TPi, TNi, FPi, FNi represent the instance numbers of true positive, true negative, false positive, and false negative, respectively. ˆ True positive: A test result where the classifier correctly predicts the positive class as positive. ˆ True negative: A test result where the classifier correctly predicts the negative class as negative. ˆ False positive: A test result where the classifier incorrectly predicts the negative class as positive. ˆ False negative: A test result where the classifier incorrectly predicts the positive class as negative. Please fill in the confusion matrix (the rows represent actual class and the columns represent predicted class) and the TP, TN, FP, FN table. Predicted Actual Class Class TP TN FP FN (b) [1 point] What is the accuracy score of the classifier model on the above test data? Accuracy = PN i=1 TPi num testdata, where N is the number of classes. (c) [2.5 points] What are the precisions, recalls, and F1 scores for each class of the classifier model on the above test data? Please fill in the table using fractions or keep 3 decimals. For each class i, Precisioni = TPi TPi+FPi For each class i, Recalli = TPi TPi+FNi For each class i, F1i = 2·Precisioni·Recalli Precisioni+Recalli Class Precision Recall F1 score (d) [1 point] What is the macro F1 score of the classifier model on the above test data? Please keep 3 decimals in your answer. The macro F1 score is the unweighted mean of the F1 scores of all classes: Macro-F1 = PN i=1 F1i N , where N is the number of classes. (e) [6 points] Given two NumPy 1D arrays with the same shape (num_testdata,): test_actual and test_predict, representing the actual class labels and predicted class labels for the test data, please implement the following functions by filling in the blanks. For each TODO, please use a one-line Python expression. import numpy as np def generate_confusion_matrix(test_actual, test_predict): # TODO 1: Get num_classes, the number of classes in the test data. # Note that the classes in the test_actual and test_predict are represented # in integer indices from [0, 1, ..., num_classes - 1]. num_classes = _________________________________________ confusion_matrix = np.zeros((num_classes, num_classes)) # TODO 2: Get the values of confusion_matrix, where the rows represent # actual class and the columns represent predicted class. for i in range(0, num_classes): for j in range(0, num_classes): confusion_matrix[i, j] = _____________________________________ return confusion_matrix def calculate_evaluation_metrics(test_actual, test_predict): confusion_matrix = generate_confusion_matrix(test_actual, test_predict) # TODO 3: Get the accuracy score, which is a scalar value. accuracy = ______________________________________________________ # TODO 4: Get the precisions for all classes, which is a 1D array # with shape (num_classes, ). precision = _____________________________________________________ # TODO 5: Get the recalls for all classes, which is a 1D array # with shape (num_classes, ). recall = ________________________________________________________ # TODO 6: Get the macro F1 score, which is a scalar value. macro_f1 = ______________________________________________________ return accuracy, precision, recall, macro_f1 Note: ˆ An expression is a combination of values, variables, operators, and calls to functions. ˆ There must be no explicit loops in your expression. ˆ Your implemented functions should work with any number of test data points and any number of classes. ˆ You cannot use any variable that is not defined inside the function or any global variable. You may find the following attribute or functions useful for this problem. ˆ np.max(a, axis = None) - return the maximum of the array a along the given axis. If axis is None, the result is a scalar value. ˆ np.ndarray.sum(axis = None) - return the sum of the array over the given axis. If axis is None, the result is a scalar value. ˆ np.ndarray.diagonal() - if the array is 2D, then a 1D array containing the diagonal elements is returned. ˆ np.mean(a, axis) - return a new array containing the average values of a over the specified axis. If axis is None, the result is a scalar value. Write your code in the space below.', 16, 10, NULL::jsonb, NULL, NULL, 'Problem 3 [16 points] Model Evaluation and Advanced Python Programming In this problem, you need to implement the evaluation metrics for multi-class classifiers. Specifically, you need to implement the confusion matrix, accuracy, precision, recall, and macro F1 score. We provide the related definitions and formulas as follows. (a) [5.5 points] Suppose there is a test dataset consisting of 10 data points, their actual classes are [2, 1, 1, 2, 0, 1, 0, 0, 1, 1], and their predicted classes by a classifier model are [2, 1, 2, 1, 0, 2, 1, 0, 0, 0]. What is the confusion matrix for the prediction results? What are TP, TN, FP, FN for each class? Confusion matrix: a table that summarized actual labels and the predictions of classifi- cation. For multi-class classification, the confusion matrix has the shape (num classes, num classes) that records the number of occurrences between actual labels and the pre- dictions. The classes are listed in the same order in the rows as in the columns, therefore the correctly classified elements are located on the main diagonal. For each class i, TPi, TNi, FPi, FNi represent the instance numbers of true positive, true negative, false positive, and false negative, respectively. ˆ True positive: A test result where the classifier correctly predicts the positive class as positive. ˆ True negative: A test result where the classifier correctly predicts the negative class as negative. ˆ False positive: A test result where the classifier incorrectly predicts the negative class as positive. ˆ False negative: A test result where the classifier incorrectly predicts the positive class as negative. Please fill in the confusion matrix (the rows represent actual class and the columns represent predicted class) and the TP, TN, FP, FN table. Predicted Actual Class Class TP TN FP FN Answer: Predicted Actual Class Class TP TN FP FN Scheme: ˆ 0.25 point for each correct numeric value. 5.25 points in total. ˆ An extra 0.25 point is given for those who gave all the correct numeric values. (b) [1 point] What is the accuracy score of the classifier model on the above test data? Accuracy = PN i=1 TPi num testdata, where N is the number of classes. Answer: 0.4 Scheme: ˆ 1 point for the correct answer. (c) [2.5 points] What are the precisions, recalls, and F1 scores for each class of the classifier model on the above test data? Please fill in the table using fractions or keep 3 decimals. For each class i, Precisioni = TPi TPi+FPi For each class i, Recalli = TPi TPi+FNi For each class i, F1i = 2·Precisioni·Recalli Precisioni+Recalli Class Precision Recall F1 score Answer: Class Precision Recall F1 score 1/2 2/3 4/7 1/3 1/5 1/4 1/3 1/2 2/5 Scheme: ˆ 0.25 point for each correct numeric value (or fraction). 2.25 points in total. ˆ An extra 0.25 point is given for those who gave all the correct numeric values (or fractions). (d) [1 point] What is the macro F1 score of the classifier model on the above test data? Please keep 3 decimals in your answer. The macro F1 score is the unweighted mean of the F1 scores of all classes: Macro-F1 = PN i=1 F1i N , where N is the number of classes. Answer: 0.407 Scheme: ˆ 1 point for the correct answer. (e) [6 points] Given two NumPy 1D arrays with the same shape (num_testdata,): test_actual and test_predict, representing the actual class labels and predicted class labels for the test data, please implement the following functions by filling in the blanks. For each TODO, please use a one-line Python expression. import numpy as np def generate_confusion_matrix(test_actual, test_predict): # TODO 1: Get num_classes, the number of classes in the test data. # Note that the classes in the test_actual and test_predict are represented # in integer indices from [0, 1, ..., num_classes - 1]. num_classes = _________________________________________ confusion_matrix = np.zeros((num_classes, num_classes)) # TODO 2: Get the values of confusion_matrix, where the rows represent # actual class and the columns represent predicted class. for i in range(0, num_classes): for j in range(0, num_classes): confusion_matrix[i, j] = _____________________________________ return confusion_matrix def calculate_evaluation_metrics(test_actual, test_predict): confusion_matrix = generate_confusion_matrix(test_actual, test_predict) # TODO 3: Get the accuracy score, which is a scalar value. accuracy = ______________________________________________________ # TODO 4: Get the precisions for all classes, which is a 1D array # with shape (num_classes, ). precision = _____________________________________________________ # TODO 5: Get the recalls for all classes, which is a 1D array # with shape (num_classes, ). recall = ________________________________________________________ # TODO 6: Get the macro F1 score, which is a scalar value. macro_f1 = ______________________________________________________ return accuracy, precision, recall, macro_f1 Note: ˆ An expression is a combination of values, variables, operators, and calls to functions. ˆ There must be no explicit loops in your expression. ˆ Your implemented functions should work with any number of test data points and any number of classes. ˆ You cannot use any variable that is not defined inside the function or any global variable. You may find the following attribute or functions useful for this problem. ˆ np.max(a, axis = None) - return the maximum of the array a along the given axis. If axis is None, the result is a scalar value. ˆ np.ndarray.sum(axis = None) - return the sum of the array over the given axis. If axis is None, the result is a scalar value. ˆ np.ndarray.diagonal() - if the array is 2D, then a 1D array containing the diagonal elements is returned. ˆ np.mean(a, axis) - return a new array containing the average values of a over the specified axis. If axis is None, the result is a scalar value. Write your code in the space below. Answer: TODO 1: np.max(test actual) + 1 or np.max(test predict) + 1 TODO 2: (test actual == 1) & (test predict == j)).sum() TODO 3: confusion matrix.diagonal().sum() / confusion matrix.sum() TODO 4: confusion matrix.diagonal() / confusion matrix.sum(axis = 0) TODO 5: confusion matrix.diagonal() / confusion matrix.sum(axis = 1) TODO 6: np.mean(2 * precision * recall) / (precision + recall)) Scheme: ˆ 1 point for each TODO. 6 points in total.', ARRAY['Evaluation and Validation']::TEXT[], 'Evaluation and Validation', 'Evaluation and Validation', ARRAY['Evaluation and Validation']::TEXT[], ARRAY['metric_computation', 'experimental_design', 'reasoning']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-midterm', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [16 points] Na¨ıve Bayes Classifier Based on the given training data in the table below, which includes both numerical and categorical attributes, make a prediction about the degree classification (DC) (i.e., First-Class Honors or Second-Class Honors D1 or Second-Class Honors D2, or Third-Class Honors) of the student with the following attribute values using Na¨ıve Bayes classifier. ˆ Study Attitude (SA): Serious ˆ Part-time Job (PTJ): No ˆ Average Energy Level (AEL): 7.2 ˆ Courses Taught by Desmond and Pearl (CDP): Yes ˆ Number of Friends in the Study Group (NFSG): 4 Student Study Attitude (SA) Categorical Part-time Job (PTJ) Categorical Average Energy Level (AEL) Numerical Courses Taught by Desmond and Pearl (CDP) Categorical Number of Friends in the Study Group (NFSG) Categorical Degree Classification (DC) Serious No 8.5 Yes First Moderate Yes 6.2 No Second D1 Serious No 7.8 Yes Second D1 Moderate Yes 5.9 No Third Serious No 8.1 Yes First Casual Yes 6.5 Yes Second D2 Serious No 7.3 No Second D1 Serious No 7.9 Yes First Moderate Yes 5.7 No Third Serious No 8.2 Yes First Casual Yes 6.1 Yes Second D2 Serious No 7.6 No Second D2 Moderate Yes 6.4 Yes Third Serious No 8.0 Yes First Casual Yes 5.8 No Second D2 Serious No 7.5 Yes Second D1 Serious No 7.7 Yes First Moderate Yes 6.0 No Third Serious No 7.8 Yes First Casual Yes 6.3 Yes Second D2 Assume each categorical attribute has the following possible values: ˆ Study Attitude: Serious, Moderate, Casual ˆ Part-time Job: Yes, No ˆ Courses Taught by Desmond and Pearl: Yes, No ˆ Number of Friends in the Study Group: 1, 2, 3, 4, 5, 6 Assume that the numerical data follow a Gaussian distribution: f(x) = σ √ 2πexp  −(x −µ)2 2σ2  where numerical training data = (x1, x2, . . . , xn) µ = 1 n n X i=1 xi, σ = v u u t n −1 n X i=1 (xi −µ)2 If needed, apply 1-Laplace Smoothing to the likelihood probabilities of the affected feature only. The affected feature means that the categorical feature has a category given some belief in the test dataset, which was NOT observed in the training dataset. Please provide all the steps. You may find the following equation useful for this question: BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) · · · P(ed|Bi)) .(a) [4 points] Calculate the mean (µ) and standard deviation (σ) of the Average Energy Level (AEL) of Degree Classification: First-Class Honors and Second-Class Honors D1. (b) [3 points] Calculate the test data sample’s likelihoods of Average Energy Level (AEL) of First-Class Honors and Second-Class Honors D1. (c) [4 points] Calculate the test data sample’s likelihoods of Study Attitude (SA), Part-time Job (PTJ), Courses Taught by Desmond and Pearl (CDP), and Number of Friends in the Study Group (NFSG) of all Degree Classification(s) (DC). (d) [2 points] Calculate the prior probabilities. (e) [3 points] Finally, calculate the posterior probabilities and make the prediction. Assume that the likelihood of Average Energy Level (AEL) of Second-Class Honors D2 and Third-Class Honors are: ˆ P(AEL=7.2|Second-Class Honors D2) = 0.32515 ˆ P(AEL=7.2|Third-Class Honors) = 0.000334158', 16, 14, NULL::jsonb, NULL, NULL, 'Problem 4 [16 points] Na¨ıve Bayes Classifier Based on the given training data in the table below, which includes both numerical and categorical attributes, make a prediction about the degree classification (DC) (i.e., First-Class Honors or Second-Class Honors D1 or Second-Class Honors D2, or Third-Class Honors) of the student with the following attribute values using Na¨ıve Bayes classifier. ˆ Study Attitude (SA): Serious ˆ Part-time Job (PTJ): No ˆ Average Energy Level (AEL): 7.2 ˆ Courses Taught by Desmond and Pearl (CDP): Yes ˆ Number of Friends in the Study Group (NFSG): 4 Student Study Attitude (SA) Categorical Part-time Job (PTJ) Categorical Average Energy Level (AEL) Numerical Courses Taught by Desmond and Pearl (CDP) Categorical Number of Friends in the Study Group (NFSG) Categorical Degree Classification (DC) Serious No 8.5 Yes First Moderate Yes 6.2 No Second D1 Serious No 7.8 Yes Second D1 Moderate Yes 5.9 No Third Serious No 8.1 Yes First Casual Yes 6.5 Yes Second D2 Serious No 7.3 No Second D1 Serious No 7.9 Yes First Moderate Yes 5.7 No Third Serious No 8.2 Yes First Casual Yes 6.1 Yes Second D2 Serious No 7.6 No Second D2 Moderate Yes 6.4 Yes Third Serious No 8.0 Yes First Casual Yes 5.8 No Second D2 Serious No 7.5 Yes Second D1 Serious No 7.7 Yes First Moderate Yes 6.0 No Third Serious No 7.8 Yes First Casual Yes 6.3 Yes Second D2 Assume each categorical attribute has the following possible values: ˆ Study Attitude: Serious, Moderate, Casual ˆ Part-time Job: Yes, No ˆ Courses Taught by Desmond and Pearl: Yes, No ˆ Number of Friends in the Study Group: 1, 2, 3, 4, 5, 6 Assume that the numerical data follow a Gaussian distribution: f(x) = σ √ 2πexp  −(x −µ)2 2σ2  where numerical training data = (x1, x2, . . . , xn) µ = 1 n n X i=1 xi, σ = v u u t n −1 n X i=1 (xi −µ)2 If needed, apply 1-Laplace Smoothing to the likelihood probabilities of the affected feature only. The affected feature means that the categorical feature has a category given some belief in the test dataset, which was NOT observed in the training dataset. Please provide all the steps. You may find the following equation useful for this question: BNB = argmaxBiP(Bi)(P(e1|Bi)P(e2|Bi)P(e3|Bi) · · · P(ed|Bi)) .(a) [4 points] Calculate the mean (µ) and standard deviation (σ) of the Average Energy Level (AEL) of Degree Classification: First-Class Honors and Second-Class Honors D1. Answer: ˆ Mean of average energy level given first-class honors = 8.02857 ˆ Standard deviation of average energy level given first-class honors = 0.26904 ˆ Mean of average energy level given second-class honors, D1 = 7.2 ˆ Standard deviation of average energy level given second-class honors, D1 = 0.69761 Scheme: ˆ 1 point for each correct mean value. 2 points in total. ˆ 1 point for each correct standard deviation value. 2 points in totals. (b) [3 points] Calculate the test data sample’s likelihoods of Average Energy Level (AEL) of First-Class Honors and Second-Class Honors D1. Answer: P(AEL = 7.2|First) = 0.26904 √ 2πexp  −(7.2 −8.02857)2 2(0.26904)2  = 0.01293 P(AEL = 7.2|SecondD1) = 0.69761 √ 2πexp  −(7.2 −7.2)2 2(0.69761)2  = 0.57187 Scheme: ˆ 1.5 points for each correct test data sample’s likelihood. 3 points in total. (c) [4 points] Calculate the test data sample’s likelihoods of Study Attitude (SA), Part-time Job (PTJ), Courses Taught by Desmond and Pearl (CDP), and Number of Friends in the Study Group (NFSG) of all Degree Classification(s) (DC). Answer: ˆ As P(SA = Serious|Third) = 0, we need to apply a add-one-trick for the study attitude. We assume that three values (Serious, Moderate, Casual) are equally prob- able: – P(SA = Serious|First) = 7+1 7+3 = 0.8 – P(SA = Serious|SecondD1) = 3+1 4+3 = 0.57 – P(SA = Serious|SecondD2) = 1+1 5+3 = 0.25 – P(SA = Serious|Third) = 0+1 4+3 = 0.1429 ˆ As P(PTJ = No|Third) = 0, we need to apply a add-one-trick for the Part-time Job. We assume that three values (Yes, No) are equally probable: – P(PTJ = No|First) = 7+1 7+2 = 0.89 – P(PTJ = No|SecondD1) = 3+1 4+2 = 0.67 – P(PTJ = No|SecondD2) = 1+1 5+2 = 0.286 – P(PTJ = No|Third) = 0+1 4+2 = 0.167 ˆ P(CDP = Y es|First) = 7 7 = 1 ˆ P(CDP = Y es|SecondD1) = 2 4 = 0.5 ˆ P(CDP = Y es|SecondD2) = 3 5 = 0.6 ˆ P(CDP = Y es|Third) = 1 4 = 0.25 ˆ As P(NFSG = 4|First) = 0, we need to apply a add-one-trick for the Number of friends in the Study Group. We assume that six values (1,2,3,4,5,6) are equally probable: – P(NFSG = 4|First) = 0+1 7+6 = 0.0769 – P(NFSG = 4|SecondD1) = 3+1 4+6 = 0.4 – P(NFSG = 4|SecondD2) = 1+1 5+6 = 0.18 – P(NFSG = 4|Third) = 0+1 4+6 = 0.1 Scheme: ˆ 0.25 point for each correct test data sample’s likelihood. 4 points in total. (d) [2 points] Calculate the prior probabilities. Answer: ˆ P(First) = 7 20 = 0.35 ˆ P(SecondD1) = 4 20 = 0.2 ˆ P(SecondD2) = 5 20 = 0.25 ˆ P(Third) = 4 20 = 0.2 Scheme: ˆ 0.5 point for each correct prior probability. 2 points in total. (e) [3 points] Finally, calculate the posterior probabilities and make the prediction. Assume that the likelihood of Average Energy Level (AEL) of Second-Class Honors D2 and Third-Class Honors are: ˆ P(AEL=7.2|Second-Class Honors D2) = 0.32515 ˆ P(AEL=7.2|Third-Class Honors) = 0.000334158 Answer: ˆ P(First|E) = (0.35)(0.8)(0.89)(0.01293)(1)(0.0769) P(E) = 0.0002478 P(E) ˆ P(SecondD1|E) = (0.2)(0.57)(0.67)(0.57187)(0.5)(0.4) P(E) = 0.0087359 P(E) ˆ P(SecondD2|E) = (0.25)(0.25)(0.286)(0.32515)(0.6)(0.18) P(E) = 0.0006277 P(E) ˆ P(Third|E) = (0.2)(0.1429)(0.167)(0.000334158)(0.25)(0.1) P(E) = 3.9872×10−8 P(E) Therefore, the Na¨ıve Bayes classifier predicts “Degree Classification” = Second-Class Honors D2 for the student. Scheme: ˆ 0.5 point for each correct posterior probability. 2 points in total. ˆ 1 point for making the correct prediction.', ARRAY['Probabilistic Models']::TEXT[], 'Probabilistic Models', 'Probabilistic Models', ARRAY['Probabilistic Models']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-midterm', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 Part(b) Continued (c) [2 points] Based on the chosen K and those 6 training samples in part (b), calculate the test error for the test dataset below. Note: ˆ When selecting a neighbor, to resolve ties, choose the neighbor with the lowest index. ˆ When evaluating the error, use – Number of wrong predictions / Number of test data points . Attribute 1 Attribute 2 Class 3.4 B 2.8 A 3.5 A 2.5 B', 18, 19, NULL::jsonb, NULL, NULL, 'Problem 5 Part(b) Continued (c) [2 points] Based on the chosen K and those 6 training samples in part (b), calculate the test error for the test dataset below. Note: ˆ When selecting a neighbor, to resolve ties, choose the neighbor with the lowest index. ˆ When evaluating the error, use – Number of wrong predictions / Number of test data points . Attribute 1 Attribute 2 Class 3.4 B 2.8 A 3.5 A 2.5 B Answer: Using K=3, test error is 0. Details: **Test Data** ([3.4, 3.0], B) k nearest neighbors, [[3.2, 2.9], B, [2.7, 3.0], A, [3.4, 3.8], B] predict label B error: 0 ([3.0, 2.8], A) k nearest neighbors, [[3.2, 2.9], B, [2.7, 3.0], A, [2.6, 2.0], A] predict label A error: 0 ([2.0, 3.5], A) k nearest neighbors, [[2.5, 3.7], A, [2.7, 3.0], A, [3.4, 3.8], B] predict label A error: 0 ([2.5, 7.0], B) k nearest neighbors, [[2.5, 3.7], A, [3.5, 4.0], B, [3.4, 3.8], B] predict label B error: 0 total error for test data: 0/4 Scheme: ˆ 2 points for giving the total error for 6-cross validation for K = 3.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-midterm', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [16 points] Leader Clustering Consider the following cluster method called Leader Clustering. It receives two parameters: an integer K and a real-value threshold T. Similar to K-means clustering, it starts by selecting K instance (which will be called leaders) and assigns each training instance to the cluster of the closest leader. During this assignment step, however, if the distance of a training instance to its closest leader is greater than the input threshold T, then this training instance forms a new cluster and becomes the initial leader of this new cluster. After all the training instances have been assigned to a cluster, the leader of each cluster is updated as the mean of the cluster. The process is then repeated until the cluster assignments do not change. (a) [6 points] Given a 1-dimensional data set { 1, 3, 5, 9, 11, 13, 15 }, use the Leader Clustering algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for forming new clusters T = 5. Fill in the following table of the first assignment iteration with your completed values and what are the leaders after the first assignment iteration? (If new clusters are formed in the process, named their leaders as c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.) Data point Distance between the data point and c1 Distance between the data point and c2 Distance between the data point and c3 (if needed) Distance between the data point and c4 (if needed) Closest Centroid The leaders after the first assignment iteration: (b) [6 points] Given a 1-dimensional data set {5, 9, 11, 13, 17, 19}, use the Leader Clustering algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 5 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for forming new clusters T = 5. Fill in the following table of the first assignment iteration with your computed values and what are the leaders after the first assignment iteration? (If new clusters are formed in the process, named their leaders as c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.) Data point Distance between the data point and c1 Distance between the data point and c2 Distance between the data point and c3 (if needed) Distance between the data point and c4 (if needed) Closest Centroid The leaders after the first assignment iteration: (c) [2 points] Which of the two methods, K-Means Clustering or Leader Clustering, will be better at dealing with outliers? Please briefly explain. (d) [2 points] During lectures, we have learned that one drawback of the K-Means Cluster- ing algorithm is that we need to specify K for the algorithm, but usually, we don’t know how many clusters there should be for an unlabeled dataset. Will the Leader Clustering mitigate this drawback? Will there be any related limitations of the Leader Clustering algorithm? Please briefly explain.', 16, 21, NULL::jsonb, NULL, NULL, 'Problem 6 [16 points] Leader Clustering Consider the following cluster method called Leader Clustering. It receives two parameters: an integer K and a real-value threshold T. Similar to K-means clustering, it starts by selecting K instance (which will be called leaders) and assigns each training instance to the cluster of the closest leader. During this assignment step, however, if the distance of a training instance to its closest leader is greater than the input threshold T, then this training instance forms a new cluster and becomes the initial leader of this new cluster. After all the training instances have been assigned to a cluster, the leader of each cluster is updated as the mean of the cluster. The process is then repeated until the cluster assignments do not change. (a) [6 points] Given a 1-dimensional data set { 1, 3, 5, 9, 11, 13, 15 }, use the Leader Clustering algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 3 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for forming new clusters T = 5. Fill in the following table of the first assignment iteration with your completed values and what are the leaders after the first assignment iteration? (If new clusters are formed in the process, named their leaders as c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.) Data point Distance between the data point and c1 Distance between the data point and c2 Distance between the data point and c3 (if needed) Distance between the data point and c4 (if needed) Closest Centroid The leaders after the first assignment iteration: Answer: Data point Distance between the data point and c1 Distance between the data point and c2 Distance between the data point and c3 (if needed) Distance between the data point and c4 (if needed) Closest Centroid c1 c1 c1 c2 c2 c2 c2 After the first assignment iteration, c1 = (1+3+5)/3 = 3, c2 = (9+11+13+15)/4 = 12. Scheme: ˆ 0.25 point for each correct numeric value (or label). 5.25 points in total. ˆ 0.25 point for the correct values of the leaders. If the values of both leader are correct, 0.75. If only one leader is correct, 0.25. (b) [6 points] Given a 1-dimensional data set {5, 9, 11, 13, 17, 19}, use the Leader Clustering algorithm and Euclidean distance to cluster the given points in the data set into 2 clusters. Assume c1 = 5 and c2 = 11 are chosen as the initial K = 2 leaders and the threshold for forming new clusters T = 5. Fill in the following table of the first assignment iteration with your computed values and what are the leaders after the first assignment iteration? (If new clusters are formed in the process, named their leaders as c3, c4, ... based on the order. Leave the distance c3/c4/... blank if the new cluster(s) are not formed yet.) Data point Distance between the data point and c1 Distance between the data point and c2 Distance between the data point and c3 (if needed) Distance between the data point and c4 (if needed) Closest Centroid The leaders after the first assignment iteration: Answer: Data point Distance between the data point and c1 Distance between the data point and c2 Distance between the data point and c3 (if needed) Distance between the data point and c4 (if needed) Closest Centroid c1 c2 c2 c2 c3 c3 After the first assignment iteration, c1 = 5, c2 = (9+11+13)/3 = 11, c3 = (17+19)/2 = 18. Scheme: ˆ 0.25 point for each correct numeric value (or label). 5 points in total. ˆ 1 point for the correct values of the leaders. 1 point in total. If there are any incorrect leaders in the answer, 0 point (c) [2 points] Which of the two methods, K-Means Clustering or Leader Clustering, will be better at dealing with outliers? Please briefly explain. Answer: Leader clustering is more robust (better at dealing with) outliers. This is because new cluster will be generated and outliers will be assigned to the new cluster without influ- encing the other clusters. Scheme: ˆ 1 point for stating which method is better at dealing with outliers. ˆ 1 point for the explanation. (d) [2 points] During lectures, we have learned that one drawback of the K-Means Cluster- ing algorithm is that we need to specify K for the algorithm, but usually, we don’t know how many clusters there should be for an unlabeled dataset. Will the Leader Clustering mitigate this drawback? Will there be any related limitations of the Leader Clustering algorithm? Please briefly explain. Answer: The Leader Clustering can mitigate this drawback of specifying K because the algorithm may get increment on the number of clusters during training. But now the limitation is that we have to specify T for the Leader Clustering algorithm. The threshold value will influence whether the algorithm increases new leader (new cluster) or not. The clustering results will be sensitive to T. Scheme: ˆ 1 point for stating whether the Leader Clustering mitigate the drawback. ˆ 1 point for the explanation. Explanations including: 1. More computation/complexity 2. Sensitive to the initial choice of leaders 3. Cannot control the number of clusters will not get any point.', ARRAY['KNN and Clustering']::TEXT[], 'KNN and Clustering', 'KNN and Clustering', ARRAY['KNN and Clustering']::TEXT[], ARRAY['manual_computation', 'distance_calculation', 'algorithm_tracing']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-midterm', '7', NULL, 7, 'long_question', 'coding', 'Problem 7 Continued: -------------------- END OF PAPER -------------------- /* Rough work */ /* Rough work */ /* Rough work */ /* Rough work */', 10, 24, NULL::jsonb, NULL, NULL, 'Problem 7 Continued: Answer: D = 2, average accuracy = 0%: In this case, the dataset was divided into two folds. Since the samples from each class were grouped together, one fold contained samples from classes 0 and 1, and the other fold contained samples from classes 2 and 3. As a result, the classifier failed to correctly classify any of the samples because it was not trained on the classes present in the test fold. D = 3, average accuracy = 50%: In this case, the dataset was divided into three folds. Fold 1 contained samples from classes 0 and 1, fold 2 contained samples from classes 1 and 2, and fold 3 contained samples from classes 2 and 3. Using fold 1 and fold 2 as training, and fold 3 as testing, only 25% of test points are classified correctly. ˆ Training: Fold 1, 2, Testing: Fold 3, Accuracy = 25% ˆ Training: Fold 1, 3, Testing: Fold 2, Accuracy = 100% ˆ Training: Fold 2, 3, Testing: Fold 1, Accuracy = 25% So, the average accuracy is (25% + 100% + 25%)/3 = 50%. D = 5, accuracy = 100%: In this case, the dataset was divided into five folds. Fold 1 contained samples from class 0, fold 2 contained samples from class 0 and 1, fold 3 contained samples from class 1 and 2, fold 4 contained samples from class 2 and 3, and fold 5 contained samples from class 3. ˆ Training: Fold 1, 2, 3, 4, Testing: Fold 5, Accuracy: 100% ˆ Training: Fold 1, 2, 3, 5, Testing: Fold 4, Accuracy: 100% ˆ Training: Fold 1, 2, 4, 5, Testing: Fold 3, Accuracy: 100% ˆ Training: Fold 1, 3, 4, 5, Testing: Fold 2, Accuracy: 100% ˆ Training: Fold 2, 3, 4, 5, Testing: Fold 1, Accuracy: 100% So, the average accuracy is (100% + 100% + 100% + 100% + 100%)/5 = 100%. To improve the implementation of D-fold cross-validation, the dataset should be shuffled before dividing it into folds. This will ensure that each fold contains a representative dis- tribution of samples from all classes, reducing bias and providing more reliable evaluation results. Scheme: ˆ 2 points for explaining the result obtained for D = 2. ˆ 3 points for explaining the result obtained for D = 3. ˆ 3 points for explaining the result obtained for D = 5. ˆ 2 points for suggesting improvement(s) to the implementation. -------------------- END OF PAPER -------------------- /* Rough work */ /* Rough work */ /* Rough work */ /* Rough work */', ARRAY['Evaluation and Validation']::TEXT[], 'Evaluation and Validation', 'Evaluation and Validation', ARRAY['Evaluation and Validation']::TEXT[], ARRAY['metric_computation', 'experimental_design', 'reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '1', NULL, 1, 'true_false', 'true_false', 'Problem 1 [10 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table in the answer book. You get 1 point for each correct answer. (a) The console output of the following Python code is [1,2,3,4,5,6,7]. import numpy as np # arange(start, stop): Values are generated within half-open interval [start,stop). array = np.arange(1,10) print(array[:-2]) (b) There is no way for the Na¨ıve Bayes classifier to make a prediction if a categorical feature (e.g., color) has a new category (e.g., blue) not observed in the training data set. (c) K-Nearest Neighbors classifier is a non-parametric machine learning algorithm with an assumption that the data are uniformly distributed. (d) It is possible for K-Means Clustering to return empty clusters if certain initial centroid positions are unfortunate. (e) If there are only two classes to predict, the following Multi-layer Perceptron (MLP) models will have the same output, given that they have the same initial weights and biases, and are trained in the same manner: ˆ MLP 1: Input layer, hidden dense layer with ReLU activation function, output layer with sigmoid activation function. ˆ MLP 2: Input layer, hidden dense layer with ReLU activation function, output layer with softmax activation function. (f) An affine transformation may preserve distances and angles. (g) The number of floating point multiplications involved when a 32×32 pixel RGB image is passed through a 2D convolutional layer with 8 3×3 kernels, padding of 3, and stride length of 1 is 8×3×3×36×36. (h) Setting the dropout rate of a Convolutional Neural Network to 0.5 means that more than 50% of its layer’s outputs are non-zero. (i) Alpha-beta pruning can sometimes change the final decision made by the minimax algo- rithm, resulting in a different move being selected for the current player. (j) Researches that involves human participants should require informed consent.', 10, 2, NULL::jsonb, NULL, NULL, 'Problem 1 [10 points] True/False Questions Indicate whether the following statements are true or false by putting T or F in the given table. You get 1 point for each correct answer. Question (a) (b) (c) (d) (e) (f) (g) (h) (i) (j) Answer T F F T F T F F F T Scheme: ˆ 1 point for giving each correct answer. 10 points in total.', ARRAY['True/False']::TEXT[], 'True/False', 'True/False', ARRAY['True/False']::TEXT[], ARRAY['concept_check', 'rapid_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '2', NULL, 2, 'long_question', 'coding', 'Problem 2 [10 points] Advanced Python: Image Processing with NumPy You are working on an image processing project that involves manipulating arrays to perform various operations on images. You have been provided with a NumPy array img representing a grayscale image of size 512 × 512: import numpy as np img = np.random.randint(0, 256, size=(512, 512), dtype=np.uint8) # Note: random.randint(low, high=None, size=None, dtype=int) # returns random integers from low (inclusive) to high (exclusive). (a) [4 points] Image masking: Implement the following function which creates a new array img masked by applying a mask to img. The mask is defined as a 2D array mask of size 512 × 512, where mask[i, j] = 1 if the pixel at img[i, j] is within a circle (boundary included) of radius 100 centered at a given position [center x, center y], and mask[i, j] = 0 otherwise. def apply_circle_mask(img, center_x, center_y): # --- YOUR CODE HERE --- return img_masked Below is an example for center x = 100 and center y = 200. img img_masked Do NOT use any explicit loop to implement the function. You may find the following functions useful for this question. ˆ np.arange(start, stop, step) returns spaced values within a given interval. – start (optional) is the start of interval. The default start value is 0. – stop is the end of interval. The returned interval does not include this value. – step (optional) is the spacing between values. The default spacing is 1. ˆ np.square(a) returns the element-wise square of an array. – a is the input array. This is equivalent to a ** 2. ˆ np.sqrt(a) returns the element-wise square root of an array. – a is the input array. ˆ np.expand dims(a, axis) returns a new array with a new axis of size 1 inserted. – a is the input array. – axis is the position where the axis is to be inserted. If axis is 0, this is equivalent to a[np.newaxis] and a[None]. If axis is 1, this is equivalent to a[:, np.newaxis] and a[:, None]. (b) [6 points] Image blurring: Your task is to use NumPy to apply the following 3 × 3 blur filter to img with zero padding so that img blur has the same shape as img. blur_filter = np.array([[1/9, 1/9, 1/9], [1/9, 1/9, 1/9], [1/9, 1/9, 1/9]]) Implement the following function so that after the whole code snippet below is executed, img blur stores the desired result. def img_flatten_conv_1d(img, v): # --- YOUR CODE HERE --- return img_conv v = blur_filter.sum(0) # ''sum(0)'' returns the sum of the array elements over axis 0. img_blur = img_flatten_conv_1d(img, v) img_blur = img_flatten_conv_1d(img_blur.T, v).T Do NOT use any explicit loop in your code. You may find the following functions useful for this question. ˆ np.convolve(a, v, mode=''full'') returns the discrete, linear convolution of two one-dimensional sequences. – a of shape (N, ): First one-dimensional input array (or array-like structure, e.g., list). – v of shape (M, ): Second one-dimensional input array (or array-like structure, e.g., list). – mode (optional): The convolution mode. It must be one of ''full'', ''valid'', ''same''. Note: use ''valid'' for this question. When ''valid'' is used, np.convolve is equivalent to the following function: def convolve_valid(a, v): if len(a) < len(v): a, v = v, a # swap the array if v is longer than a c = np.zeros(len(a) - len(v) + 1) for i in range(len(c)): c[i] = np.sum(a[i:i+len(v)] * v[::-1]) return c Examples: >>> np.convolve([1,2,3],[0,1,0.5], ''valid'') array([2.5]) # this is the output array >>> np.convolve([1,2,3,4],[0,1,0.5], ''valid'') array([2.5, 3.5]) # this is the output array ˆ np.zeros(shape, dtype=float) returns a new array of given shape and type, filled with zeros. – shape is the shape of the new array, e.g., (2, 3) or 2. – dtype (optional) is the desired data type for the array, e.g., np.int8. The default is np.float64. ˆ np.concatenate((a1, a2, ...), axis=0) joins a sequence of arrays along an ex- isting axis. – a1, a2, ... is a sequence of arrays (or array-like structure, e.g., list). The arrays must have the same shape, except in the dimension corresponding to axis (the first, by default). – axis (optional) is the axis along which the arrays will be joined. If axis is None, arrays are flattened before use. The default is 0. ˆ np.reshape(a, newshape) gives a new shape to an array without changing its data. – a is the array to be reshaped. – newshape is the new shape. It should be compatible with the original shape. If an integer, then the result will be a 1D array of that length. One shape dimension can be −1. In this case, the value is inferred from the length of the array and the remaining dimensions (if any). This is equivalent to a.reshape(newshape). ˆ np.transpose(a) returns the transpose of an array. – a is the input array. This is equivalent to a.T. Hints: (1) In this case, applying blur filter to the image can also be done by consecutively applying two 1D filters, one vertically and the other horizontally, to the image. (2) Since np.convolve only accepts 1D arrays, you may consider flattening the image array, applying np.convolve to the flattened array, and then reshaping it back to a 2D array. (3) Be aware of the boundaries since np.convolve with mode=''valid'' does not pad the array and the output array does not always have the same shape as the input array. Also, remember to remove the padding (if any) after convolutions.', 10, 3, NULL::jsonb, NULL, NULL, 'Problem 2 [10 points] Advanced Python: Image Processing with NumPy Solution: (a) def apply_circle_mask(img, center_x, center_y): indices = np.arange(512) x_dist = indices - center_x y_dist = indices - center_y dist = np.sqrt(x_dist ** 2 + y_dist[:, None] ** 2) img_masked = img * (dist <= 100) return img_masked (b) def img_flatten_conv_1d(img, v): zeros = np.zeros((512, 1)) img_padded = np.concatenate((zeros, img, zeros), axis=1) img_flat = img_padded.reshape(-1) img_flat_conv = np.convolve(img_flat, v, ''valid'') img_flat_conv = np.concatenate(([0], img_flat_conv, [0])) img_conv = img_flat_conv.reshape(512, 514)[:, 1:-1] return img_conv v = [1/3, 1/3, 1/3] img_blur = img_flatten_conv_1d(img, v) img_blur = img_flatten_conv_1d(img_blur.T, v).T Scheme: (a) If no explicit loop is used, 1 point for each of the following: ˆ correct broadcasting to create 2D distance array; ˆ correct distance values; ˆ correct mask (0.5 point for the opposite mask); ˆ correct final result. If explicit loops are used, 1 point in total if the final result is correct. (b) If no explicit loop is used, 1 point for each of the following: ˆ correct padding before flattening the image array (0.5 point if padding is applied after flattening); ˆ correct flattening of the image array; ˆ correct dimensions of inputs (1D arrays) to np.convolve; ˆ correct values of inputs to np.convolve; ˆ correct padding after np.convolve; ˆ correct reshaping back to 2D image and removing padding (0.5 each). If explicit loops are used, 1 point in total if the final result is correct.', ARRAY['Python Fundamentals']::TEXT[], 'Python Fundamentals', 'Python Fundamentals', ARRAY['Python Fundamentals']::TEXT[], ARRAY['implementation', 'code_tracing', 'debugging']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '3', NULL, 3, 'long_question', 'long_answer', 'Problem 3 [12 points] Na¨ıve Bayes, K-Nearest Neighbors and Perceptron Below is some health data of patients with and without dengue fever. Patient # Diagnosis (B) Body Temperature (Celsius) (e1) Pulse Rate (bpm) (e2) Dengue Dengue Dengue No dengue No dengue 36.5 No dengue You can assume the data follows Gaussian distribution: f(x) = √ 2πσ2 exp(−(x −µ)2 2σ2 ) where µ = 1 n n X i=1 xi σ = v u u t n −1 n X i=1 (xi −µ)2 (a) [6 points] Using the data above, calculate the following. Round off calculations to the fourth decimal place. (i) P(e1 = 36 | B = No dengue) (ii) P(e2 = 85 | B = No dengue) (iii) P(e1 = 36 | B = No dengue) P(e2 = 85 | B = No dengue) P(B = No dengue) (iv) Calculate the posterior probability that the belief is ‘No dengue’ for the sample with evidence E = {e1 = 36, e2 = 85} if P((e1 = 36, e2 = 85)|B = Dengue)P(B = Dengue) = 0.0000036 (b) [4.5 points] Identify the reasons for inaccurate predictions when using the following number of folds to evaluate the performance of a 3-Nearest Neighbors classifier model on the given dataset (i.e., not including the sample introduced in part (a)) without shuffling. (i) No. of folds = 6 (ii) No. of folds = 3 (iii) No. of folds = 2 (c) [1.5 points] If the true label of the sample {e1 = 36, e2 = 85} is ‘No dengue’, will the perceptron model make a good prediction for the sample? Provide explanations with evidence for why or why not.', 12, 6, NULL::jsonb, NULL, NULL, 'Problem 3 [12 points] Na¨ıve Bayes, K-Nearest Neighbors and Perceptron Solution: (a) (i) p 2π(0.5)2 exp  −(36 −36.5)2 2(0.52)  = 0.4839 (ii) p 2π(15)2 exp  −(85 −90)2 2(152)  = 0.0252 (iii) (0.4839)(0.0252)(0.5) = 0.0061 (iv) 0.0061 0.0061 + 0.0000036 = 0.9994 (b) (i) This will predict 5/6 correctly if uniform distance (prediction for Patient 5 is wrong). Also, if inverse distance is used, the prediction for Patient 3 will also be wrong. (ii) It is possible that the data will be folded in such a way that the class opposite to the true label is typically the majority class e.g. (Patient 1, Patient 2), (Patient 3, Patient 4), (Patient 5, Patient 6) . In this case, most folds will have 50% accuracy. The performance may be even worse as there is one pair of samples from opposite classes with minimum pairwise Euclidean distance (Patient 3 and Patient 5). (iii) This is not acceptable because it is possible that the training set for the KNN model will have the class opposite to the true label as its majority class. In this case, the prediction for all samples will be wrong. (c) Yes, because in that case the data is linearly separable. Scheme: (a) (i) 1.5 points for giving the correct answer. (ii) 1.5 points for giving the correct answer. (iii) 1.5 points for giving the correct answer. (iv) 1.5 points for giving the correct answer. (b) (i) 1.5 point for giving the correct reason. (ii) 1.5 point for giving the correct reason. (iii) 1.5 point for giving the correct reason. (c) 0.5 point for stating “Yes”, i.e., perceptron model will make a good prediction for the sample. 1 point for giving the correct explanation.', ARRAY['Probabilistic Models', 'KNN and Clustering', 'Perceptron and MLP']::TEXT[], 'Probabilistic Models', NULL, ARRAY['Probabilistic Models', 'KNN and Clustering', 'Perceptron and MLP']::TEXT[], ARRAY['manual_computation', 'probability_reasoning', 'classification_decision', 'distance_calculation', 'algorithm_tracing', 'forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '4', NULL, 4, 'long_question', 'long_answer', 'Problem 4 [11 points] Multi-layer Perceptron (a) [5 points] Amanda wants to train a multi-layer perceptron to classify various renting options’ popularity. Each housing option is represented with X, a three-dimensional vector (x1, x2, x3). x1 is the noise level rated from 1 to 5. x2 is the proximity to campus, measured in minutes it takes to arrive at the north gate. x3 is the availability of food options rated on a scale from 1 (no food available) to 5 (a wide variety of food available). Amanda has collected the opinions of a group of students on their preferences, classifying the housing options into three types: low, medium, and high, represented as one-hot vector Y = (y1, y2, y3). When designing the network architecture, Amanda has two ideas. Model A Model B Layer (type) Output Shape Layer (type) Output Shape dense 1 (Dense) (None, 6) dense 1 (Dense) (None, 3) dense 2 (Dense) (None, 4) dense 2 (Dense) (None, 5) dense 3 (Dense) (None, 3) dense 3 (Dense) (None, 2) dense 4 (Dense) (None, 3) (i) In a multi-layer perceptron, suppose there are L hidden layers, each layer k (starting from 1) has lk hidden nodes. The input data is a n-dimensional vector, and the output data is m-dimensional. (I) Calculate the number of updated parameters in the MLP model. Represent your result using n, m, L, and lk for k = 1, . . . , L. (II) Apply your result in part (a)(i)(I) to Model A and Model B separately. (ii) Please help Amanda decide on which multi-layer perceptron (A or B) to choose for potentially better performance. Please briefly explain why. (b) [1 point] Why do we use activation functions in multi-layer perceptron neural networks? (c) [2 points] Consider the two activation functions we have learned in class: ˆ Sigmoid activation function: σ(x) = 1+e−x . ˆ Binary step activation function: f(x) =    if x ≤0 otherwise The binary step activation function gives a hard threshold at 0, and its gradients are 0 almost everywhere. What is the problem if we use the binary step activation function in a multi-layer perceptron network? If we want to avoid the problem while approximating the binary step activation function, how to make use of the sigmoid activation function to achieve that? (d) [3 points] You are given an multi-layer perceptron model with the architecture shown below. ˆ ReLU: f(z) = max(0, z). ˆ Softmax: f(zi) = ezi Pn−1 j=0 ezj , where z = [z0, z1, . . . , zn−1]. (i) For a sample with features x1=1 and x2=1, what are the outputs of the hidden layer and the output layer? If necessary, round off the values to two decimal places. (ii) If the target labels have values Tk1=1,Tk2=0 for the sample in part d(i), calculate the new values of the weights: w5, w7, and w1 after one round of backward propagation if the learning rate is 0.4. Round off the values to four decimal places. For reference, here are some of the equations used in the back propagation. δk = (Ok −Tk)Ok(1 −Ok) δj = Oj(1 −Oj) X k∈K δkwjk wjk ←wjk −ηδkOj wij ←wij −ηδjOi', 11, 7, NULL::jsonb, NULL, NULL, 'Problem 4 [11 points] Multi-layer Perceptron Solution: (a) (i) (I) For any hidden layer and the output layer, the parameters include the weight and the bias. (n + 1) × l1 + L−1 X k=1 lk+1 × (lk + 1) + m × (lL + 1) (II) Model A: 67; Model B: 53. (ii) Model A is better because it has more parameters and therefore more expressive. (b) To add non-linearity to the neural network model so that it has more powerful modeling capability. (c) The problem is that we cannot learn the parameters using gradient descent since the gradients are 0 almost everywhere. We can solve the problem while approximating a hard threshold by scaling up the weights in a sigmoid activation function. For example, σ(cx) is steeper than σ(x) and more similar with the binary step function, for c > 1. (d) (i) Hidden layer Oj1 = 0.2, Oj2 = 0 Output layer Ok1 = 0.48, Ok2 = 0.52 (ii) w ′ 5 = w5 −δk1ηOj1 = 0.1 −(−1298)(0.4)(0.2) = 0.1104 w ′ 7 = w7 −δk2ηOj1 = 0 −(1298)(0.4)(0.2) = −0.0104 δj1 = Oj1(1−Oj1)(δk1w5+δk2w7) = 0.2(1−0.2)(−0.1298(0.1)+0.1298(0)) = −0.0021 w ′ 1 = w1 −δj1ηx1 = 0.2 −(0.4)(−0.0021)(1) = 0.2008 Scheme: (a) (i) (I) 1.5 points for giving the correct formula. (II) 1 point for each correct answer. 2 points in total. (ii) 0.5 point for stating Model A is better. 1 point for giving the correct explanation. 1.5 points in total. (b) 1 point for giving the correct answer for why we use activation functions in multi-layer perceptron. (c) 1 point for stating the problem. 1 point for explaining how to make use of the sigmoid function to avoid the problem. 2 points in total. (d) (i) 0.5 point for giving each correct output. 1.5 points in total. (ii) 0.5 point for giving each correct weight value. 1.5 points in total.', ARRAY['Perceptron and MLP']::TEXT[], 'Perceptron and MLP', 'Perceptron and MLP', ARRAY['Perceptron and MLP']::TEXT[], ARRAY['forward_pass', 'backpropagation', 'weight_update']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '5', NULL, 5, 'long_question', 'long_answer', 'Problem 5 [13 points] Digital Image Processing (a) [4.5 points] The following is a grayscale image of the HKUST redbird and its correspond- ing histogram. An grayscale image histogram is a distribution showing the frequency of occurrence of each gray-level value. After some transformation on the original image, we get the transformed images shown below (first row). Their histograms are shuffled and shown in the rows below (second, third, and fourth row). Please state which transformations are applied to get the resulting images for (a),(b),(c), and what is the correct pairing for the images and histograms between {a,b,c} and {d,e,f}. Please also briefly state why. (b) [3 points] Consider the following 3 × 3 image. Perform binary thresholding of the image using Otsu’s method. The initial threshold is T=100, and we apply one iteration. What is the resulting threshold and the resulting image after thresholding? What is the advantage of using Otsu’s Method for image thresholding compared to the regular image thresholding algorithm? (c) [2 points] What is the resulting image of size 7 × 7 after adding reflection padding of size 2 on the original image in part (b)? (d) [1.5 points] Please briefly explain the effect of convolving an image with the following kernels. (i) Kernel 1: 1/9 1/9 1/9 1/9 1/9 1/9 1/9 1/9 1/9 (ii) Kernel 2: -1 -1 -1 (iii) Kernel 3: -1 -1 -1 -1 -1 -1 -1 -1 (e) [2 points] Is it possible to design a 3 × 3 kernel and apply convolution with the kernel to flip a 64Ö64 image horizontally or vertically? If yes, please give such a kernel. If not, please explain why.', 13, 9, NULL::jsonb, NULL, NULL, 'Problem 5 [13 points] Digital Image Processing Solution: (a) (b)-(f): image (horizontal) flipping because the histogram is the same as the original image. (c)-(d): binary thresholding because there are only two values 0 and 255 in the histogram (a)-(e): contrast stretching because the intensity value in the histogram has been stretched to a wider range. (b) After the first iteration µ1=21 , µ2=128, T = 74.5 The resulting images are Compared to regular image thresholding algorithms, Otsu’s method has advantages (i) can automatically determine the threshold value T (ii) the resulting threshold value is reproducible. Given the same image, two researchers using Otsu’s algorithm must arrive at the same threshold. (c) Resulting image: (d) (i) smoothing, (ii) vertical edge detection, (iii) sharpening. (e) No, because the image flipping is a global operation on the image, but convolution with a 3 × 3 kernel is a local operation. Concretely, the 3 × 3 kernels can only capture the input value of 3 × 3 neighbors, but the flipping requires the pixel value information at a longer distance. The longest dependency distance can be 64. Scheme: (a) 1 for stating each transformation correctly. 3 points in total. 0.5 point for giving each correct pairing. 1.5 points in total. (b) 1 point for giving the correct resulting threshold. 1 point for giving the correct result image. 1 point for stating the advantage of using Otsu’s method. 3 points in total. (c) 0.05 for giving each correct value (40 values). 2 points in total. (d) 0.5 point for stating each effect of convolving with the given kernel correctly. 1.5 points in total. (e) 0.5 point for stating it is impossible to design a 3×3 kernel and apply it to flip the 64×64 image. 1.5 points for giving the explanation.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '6', NULL, 6, 'long_question', 'long_answer', 'Problem 6 [13 points] Dilated Convolution and Dropout (a) [10 points] Dilated convolution is a variation of the standard convolution operation that involves skipping input elements by a certain dilation rate. By doing this, the convo- lutional kernel can be made to “see” a larger area of the input data without actually increasing the number of parameters it has. The dilation rate determines the spacing between the values in the kernel. The figure below demonstrates how a 3×3 kernel is applied to a 7×7 image using a dilation factor of 2. Given the incomplete implementation of the Python function dilated convolution that takes the following inputs: ˆ input array: a 2D NumPy array representing the input data ˆ kernel: a 2D NumPy array representing the convolutional kernel ˆ dilation rate: an integer representing the dilation rate (default value is 1) ˆ stride: an integer representing the stride (default value is 1) ˆ padding: a string representing the padding type, either ‘valid’ (no padding) or ‘same’ (padding to preserve input dimensions) (default value is ‘valid’) and returns a 2D NumPy array representing the output of the dilated convolution oper- ation. Complete the missing parts of the function using NumPy, without using any special- ized deep learning libraries so that the execution of the test script produces the required output. Make sure that your implementation supports stride and padding options. You may find the following formula for determining the size of output image of reg- ular image convolution useful for this question. (Size of image dimension - Size of kernel dimension + 2 × Padding) / Stride + 1 import numpy as np # Zero pad the input_array. For example # a = [[1,2,3]] # np.pad(a, ((1,2),(3,4)), ''constant'') # >> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # >> [0, 0, 0, 1, 2, 3, 0, 0, 0, 0], # >> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # >> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) def dilated_convolution(input_array, kernel, dilation_rate=1, stride=1, padding=''valid''): # Apply padding to the input array if specified if padding == ''same'': pad = #______ TODO 1 ______ padded_input = np.pad(input_array, ((pad, pad), (pad, pad)), mode=''constant'') else: padded_input = input_array # Calculate the output shape output_rows = #______ TODO 2 ______ output_cols = #______ TODO 3 ______ kernel_rows, kernel_cols = kernel.shape # Initialize the output array with zeros output_array = np.zeros((output_rows, output_cols)) # Iterate through the kernel and perform the convolution for i in range(kernel_rows): for j in range(kernel_cols): # Calculate the input indices for the current kernel position input_row_indices = #______ TODO 4 ______ input_col_indices = #______ TODO 5 ______ # Perform the convolution and accumulate the results in the output array output_array += #______ TODO 6 ______ return output_array # Test script input_array = np.array(np.arange(100).reshape(10,10)) kernel = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) dilation_rate = 2; stride = 2; padding = ''same'' output_array = dilated_convolution(input_array, kernel, dilation_rate, stride, padding) print(output_array) # Output: # [[ 22. 26. 30. 34. 8.] # [ 62. 66. 72. 78. 34.] # [102. 126. 132. 138. 74.] # [142. 186. 192. 198. 114.] # [ 80. 142. 146. 150. 154.]] You may find the following functions useful for this question. ˆ np.arange(start, stop, step) returns spaced values with a given interval. – start (optional) is the start of interval. The default start value is 0. – stop is the end of interval. The returned interval does not include this value. – step (optional) is the spacing between values. The default spacing is 1. ˆ np.array(object) creates an array. – object is an array or any sequence. ˆ np.reshape(a, newshape) gives a new shape to an array without changing its data. – a is the array to be reshaped. – newshape is the new shape. It should be compatible with the original shape. If an integer, then the result will be a 1D array of that length. One shape dimension can be -1. In this case, the value is inferred from the length of the array and the remaining dimensions (if any). This is equivalent to a.reshape(newshape). ˆ ndarray.shape returns a tuple of array dimensions. ˆ ndarray.zeros(shape, dtype=float) returns a new array of given shape and type, filled with zeros. – shape is the shape of the new array, e.g., (2,3) or 2. – dtype(optional) is the desired data type for the array, e.g., np.int8. The default is np.float64. ˆ range(n) returns a numeric series starting with 0 and extending up to but not including n. (b) [3 points] You are given an incomplete implementation of the Python function dropout, which implements the Dropout technique for regularization in neural networks. The function takes the following input: ˆ input array: a 2D NumPy array representing the input data ˆ p: a dropout probability The function generates a binary mask with the same shape as the input, where each element is 1 with probability 1-p and 0 with probability p. The input is then multiplied element-wise by the mask, effectively dropping out random elements. The function re- turns a NumPy array representing the output of the dropout operation. Complete the missing part of the function using NumPy, without using any special- ized deep learning libraries so that the execution of the test script produces the required output. You may find the following function useful for this question. ˆ np.random.rand(d0, d1, . . ., dn) returns an array of the given shape and popu- late it with random samples from a uniform distribution over [0, 1). – d0, d1,. . ., dn (optional) represent the dimensions of the returned array, must be non-negative. If no argument is given, a single Python float is returned. import numpy as np def dropout(input_array, p): mask = #______ TODO ______ return input_array * mask', 13, 12, NULL::jsonb, NULL, NULL, 'Problem 6 [13 points] Dilated Convolution and Dropout Solution: (a) Dilated Convolution TODO # Answer ((kernel.shape[0] - 1) * dilation_rate) // 2 1.5 points (input_array.shape[0] - kernel.shape[0] * dilation_rate + 2 * pad) // stride + 1 1.5 points (input_array.shape[1] - kernel.shape[1] * dilation_rate + 2 * pad) // stride + 1 1.5 points np.arange(0, output_rows * stride, stride) + i * dilation_rate 1.5 points np.arange(0, output_cols * stride, stride) + j * dilation_rate 1.5 points kernel[i, j] * padded_input[np.ix_(input_row_indices, input_col_indices)] 2.5 points (b) Dropout (np.random.rand(input_array.shape[0],input_array.shape[1]) < p) / p # 3 points', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '7', NULL, 7, 'long_question', 'long_answer', 'Problem 7 [18 points] Convolutional Neural Network (a) [8 points] Consider the following Keras implementation of a Convolutional Neural Net- work: from keras.models import Sequential from keras.layers import Conv2D, MaxPooling2D from keras.layers import Dense, Flatten model = Sequential() model.add(Conv2D(filters=32, kernel_size=(5, 5), padding=''same'', activation=''relu'', input_shape=(32, 32, 3))) model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'', activation=''relu'')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'', activation=''relu'', strides=(2, 2))) model.add(Conv2D(filters=64, kernel_size=(3, 3), padding=''same'', activation=''relu'')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Flatten()) model.add(Dense(units=128, activation=''relu'')) model.add(Dense(units=10, activation=''softmax'')) (i) What is the padding size (in terms of border width) of the first convolutional layer? (ii) Fill in the blanks in the answer book in the model.summary() of the network. Assume all convolutional layers have biases. Numbers shown as ? are kept secret and you may ignore them. Hint: When padding = “same” and strides = 2, the output has the half size of the input. (iii) Suppose you observe the following loss curves as you train the Convolutional Neural Network. What is the most likely problem with the model? When does the problem usually occur? Describe a common way to mitigate the problem and explain why it works (in a few sentences). (b) [5 points] What is the output of the following parts of this very tiny convolutional neural network? This network was trained to classify whether the input 4×4 image depicts the letter “L” or not. Stride is 1, padding size is 0 for all layers, and the pool size for the max pooling layer is (2, 2). Show your calculation step by step. You only need to keep 1 decimal number. ˆ ReLU activation function f(z) = max(0, x). ˆ Sigmoid activation function f(z) = 1+e−z , where e = 2.718. Note: Assume the given kernels have already been flipped. (i) Feature map after Max-Pooling (ii) Feature in the fully-connected layer (iii) Output (c) [5 points] In video processing, 3D convolution is particularly valuable due to its ability to capture temporal dynamics as well as spatial features, which is essential for understanding content across frames. Unlike 2D convolution, which only analyzes spatial patterns within a single frame, 3D convolution extends the analysis across multiple consecutive frames, treating time as the third dimension alongside height and width. This approach allows a Convolutional Neural Network (CNN) to not only perceive static features in individual frames but also to understand movements and changes over time, which are crucial for tasks such as action recognition, video classification, and anomaly detection in video streams. Suppose we have video data of shape (32, 400, 300, 3), corresponding to (#keyframes, width, height, #channels). In the first layer, we want to apply a 3D convolution with 100 kernels in the shape of (3, 3, 3), corresponding to (keyframe, width, height). The padding is (0, 2, 2), and the stride is (1, 2, 2). Please answer the following question with calculation steps or rationales. (i) What is the shape of the output after this layer? (ii) How many weight parameters are there? (iii) How many bias parameters are there?', 18, 16, NULL::jsonb, NULL, NULL, 'Problem 7 [18 points] Convolutional Neural Network Solution: (a) (i) 2 (ii) 16, 16, 64 8, 8, (iii) The model has overfit the training dataset. Overfitting occurs when a model is too complex and learns the noise in the training data rather than the underlying patterns. Dropout can be used to mitigate the problem. Dropout is a technique where, during training, some neurons in the network are randomly dropped out (i.e., their outputs are set to zero) with a certain probability, typically 0.2 or 0.5. This means that, at each training iteration, a different subset of neurons is randomly selected to be “dropped out.” Dropout helps prevent overfitting in several ways (full marks to any of the fol- lowing): ˆ Reducing capacity: By randomly dropping out neurons, the network’s capacity is reduced, making it less prone to overfitting. With fewer neurons, the network has fewer opportunities to memorize the training data. ˆ Forcing feature sharing: Dropout encourages feature sharing among neurons. When a neuron is dropped out, the network must rely on other neurons to make predictions, which promotes feature sharing and reduces overfitting. ˆ Preventing complex co-adaptations: Dropout breaks the complex co-adaptations between neurons, which can lead to overfitting. By randomly dropping out neu- rons, the network is forced to learn simpler, more generalizable representations. ˆ Improving generalization: Dropout can be seen as a form of data augmentation. By randomly dropping out neurons, the network is forced to generalize to new, unseen situations, which improves its ability to generalize to new data. ˆ Reducing the risk of over-reliance on a single neuron: Dropout prevents the network from relying too heavily on a single neuron or a small group of neurons. This reduces the risk of overfitting, as the network is forced to use multiple neurons to make predictions. ˆ Ensemble-like behavior: Dropout can be seen as an ensemble method, where multiple sub-networks are trained simultaneously. Each sub-network is a different subset of neurons, and the final prediction is an ensemble of these sub-networks. This ensemble-like behavior improves generalization and reduces overfitting. The answer is not unique. (b) Answer: (c) (i) For any dimension (let’s denote it generically as D): Output D = floor((Input D + 2 × Padding - Kernel D ) / Stride) + 1. Therefore, we have: Output keyframe = floor((32 + 2 * 0 - 3) / 1) + 1 = 30 Output width = floor((400 + 2 * 2 - 3) / 2) + 1 = 201 Output height = floor((300 + 2 * 2 - 3) / 2) + 1 = 151 The output shape is (30, 201, 151, 100). (ii) There are 100 kernels in the shape of (3, 3, 3). Therefore, the number of weight parameters is 1000 Ö 3 Ö 3 Ö 3 x 3 = 8100. (iii) There is 1 bias parameter per kernel. So the total biases is 100. Scheme: (a) (i) 1 point for giving the correct padding size. (ii) 1 point for giving each correct shape (3 shape values). 1 point for giving the correct number of parameters. 4 points in total. (iii) 1 point for stating the model has overfit the training dataset. 1 point for explaining what does the problem usually occur. 1 point for describing a way to mitigate the problem. 3 points in total. (b) 1 point for giving each correct feature map (2 feature maps). 1 point for giving each feature in the fully-connected layer (2 features). 1 point for giving the correct output. 5 points in total. (c) (i) 2 points for giving the correct output shape after this layer. (ii) 1.5 points for giving the correct number of weight parameters. (iii) 1.5 points for giving the correct total number of biases.', ARRAY['Vision and CNN']::TEXT[], 'Vision and CNN', 'Vision and CNN', ARRAY['Vision and CNN']::TEXT[], ARRAY['manual_computation', 'filter_computation', 'architecture_reasoning']::TEXT[], 'hard', '', '', ''), ('COMP2211-2024-spring-final', '8', NULL, 8, 'long_question', 'long_answer', 'Problem 8 [10 points] Minimax and Alpha-Beta Pruning (a) [5 points] The figure below shows a tree in a minimax game with two players. A B C D E F G H I J MIN MAX Terminal nodes Score (i) Calculate the best score for the non-terminal nodes with the minimax algorithm. (ii) Suppose we are using an alpha-beta pruning algorithm. Indicate which edge will be pruned. Gives the alpha-beta values when running. We denote the edge between nodes A and B as AB. (iii) How do you make a minimax algorithm to find the shortest path to victory? Explain in one sentence. (b) [5 points] Consider the non zero-sum generalization in which the sum of the two players’ utilities are not necessarily zero. Because player A’s utility no longer determines player B’s utility exactly, the leaf utilities are written as pairs (UA, UB), with the first and second component indicating the utility of that leaf to player A and player B respectively. In this generalized setting, player A seeks to maximize UA, the first component, while player B seeks to maximize UB, the second component. (i) Complete the table in the answer book by estimating the value (as pairs) at each of the internal node. Assume that each player maximizes their own utility. (ii) Is alpha-beta pruning still applicable in this case? Briefly explain why and provide an example. (Hint: you can think about the case where UA(s) = UB(s) for all nodes.)', 10, 18, NULL::jsonb, NULL, NULL, 'Problem 8 [10 points] Minimax and Alpha-Beta Pruning Solution: (a) (i) Answer: Nodes Score A B C D (ii) Answer: Edge Alpha Beta CH DJ (iii) Record depth information to distinguish paths. (b) (i) Answer: A (2,4) B (0,3) C (-1,3) D (1,1) E (0,-2) (ii) No. The values that the first and second player are trying to maximize are inde- pendent. Therefore, the principle for pruning in alpha-beta pruning—that a worse outcome for one player implies a better outcome for the other—no longer applies. For instance, in the case where UA(s) = UB(s) for all nodes, the problem reduces to searching for the max-valued leaf, which could appear anywhere in the tree. Scheme: (a) (i) 0.25 point for giving each correct numeric value. 1 point in total. (ii) 0.5 point for giving each correct edge/numeric value. 3 points in total. (iii) 1 point for giving a way to find the shortest path to victory. (b) (i) 0.5 point for giving each pair of values. 2.5 points in total. (ii) 0.5 point for stating “No”. 1 point for explaining why and 1 point for giving an example. 2.5 points in total.', ARRAY['Search and Games']::TEXT[], 'Search and Games', 'Search and Games', ARRAY['Search and Games']::TEXT[], ARRAY['tree_search', 'pruning', 'manual_tracing']::TEXT[], 'medium', '', '', ''), ('COMP2211-2024-spring-final', '9', NULL, 9, 'long_question', 'short_answer', 'Problem 9 [3 points] Ethics of Artificial Intelligence From what areas we should ensure the AI models in production in their organizations func- tion ethically? -------------------- END OF PAPER -------------------- /* Rough work */ /* Rough work */ /* Rough work */ /* Rough work */ /* Rough work */', 3, 19, NULL::jsonb, NULL, NULL, 'Problem 9 [3 points] Ethics of Artificial Intelligence Solution: ˆ Data Ethics ˆ Fair AI model (or avoiding AI model bias) ˆ AI model monitoring and maintenance. Scheme: ˆ 1 point for giving each area correctly. 3 points in total. -------------------- END OF PAPER --------------------', ARRAY['Ethics of AI']::TEXT[], 'Ethics of AI', 'Ethics of AI', ARRAY['Ethics of AI']::TEXT[], ARRAY['concept_explanation', 'argumentation', 'comparison']::TEXT[], 'easy', '', '', '') ) AS seed( source_exam_key, question_number, parent_question, display_order, question_type, question_format, question_text, score, page_number, options, correct_option, correct_answer, raw_answer_text, topics, topic_primary, analytics_topic, topic_tags, skill_tags, difficulty, knowledge_reminder, ai_hint, solution ) JOIN papers AS p ON p.source_exam_key = seed.source_exam_key AND p.source_kind = 'course_library' WHERE NOT EXISTS ( SELECT 1 FROM paper_questions q WHERE q.paper_id = p.id AND q.question_number = seed.question_number );