;;;; VERSION-SPACE.LISP January 1988, An implementation of the Version Space Algorithm ;;;; for incremental learning from examples ;;;; Copyright (c) 1988 by Raymond Joseph Mooney. This program may be freely copied, used, or ;;;; modified provided that this copyright notice is included in each copy of this code ;;;; and parts thereof. ;;;; This is a basic version space learning algorithm. In order to use it for any ;;;; representation of instances and generalizations, the user must define the ;;;; following functions: ;;;; ;;;; equal-generalizations(x ,y): Returns T iff x and y are equal generalizations. ;;;; match(generalization, instance): Returns T iff generalization matches instance. ;;;; more-general?(x, y): Returns T iff generalization x is strictly more general ;;;; than generalization y. ;;;; initialize-g: Returns initial set of most general generalizations. ;;;; specialize-against(generalization, Returns a list of minimal specializations of the ;;;; instance) given generalization which do not match the instance. ;;;; generalize-to(generalization, Returns a list of minimal generalizations of the ;;;; instance) given generalization which do match the instance. ;;;; ;;;; A sample set of these functions for nominal feature vectors represented as ordered lists ;;;; is included. A sample data set using this representation is in the file: FIGURE-DATA. (defvar *s*) ; *s* is the most specific set of generalizations (S) (defvar *g*) ; *g* is the most general set of generalizations (G) (defparameter *trace-vs* nil) ; produces trace if set to T. (defmacro trace-print (test-var &rest format-form) ;; Print using the format string only if test-var (usually a trace-* variable)is nonNIl `(if ,test-var (format t ,@format-form))) (defun version-space (examples) ;;; This function takes a list of examples where an example is a list whose first element ;;; is either + or - to indicate a positive or negative example, and whose second element ;;; is a description of that example (an instance). It uses the version-space algorithm to ;;; determine the sets of most specific and most general generalizations consistent with all of ;;; the examples. Returns T if concept converged. After execution, most specific and most ;;; general generalizations are available in the variables *s* and *g*, respectively. (setf *g* (initialize-g)) (let ((first-pos (assoc '+ examples)) (converged nil)) (setf examples (remove first-pos examples)) (setf *s* (list (second first-pos))) ;initialize S to first positive instance (trace-print *trace-vs* "~%~%Example: ~A" first-pos) (trace-print *trace-vs* "~%S= ~A ~%G= ~A" *s* *g*) ;; process each additional example incrementally (dolist (example examples) (trace-print *trace-vs* "~%~%Example: ~A" example) (cond ((eq (first example) '-) ;; if example is a negative example, ;; then remove any members of S which match its instance and update G (setf *s* (remove (second example) *s* :test #'reverse-match)) (update-g (second example))) ((eq (first example) '+) ;; if example is a positive example ;; then remove any members of G which do not match its instance and update S (setf *g* (remove (second example) *g* :test-not #'reverse-match)) (update-s (second example)))) (trace-print *trace-vs* "~%S= ~A ~%G= ~A" *s* *g*) (cond ((or (null *g*)(null *s*)) ;; if either S or G is empty then concept cannot be described (format t "~%Langauage is insufficient to describe the concept") (return nil)) ((and (eq (length *s*) 1) (eq (length *g*) 1) (equal-generalizations (first *s*)(first *g*)) (null converged)) ;; if the version space has just converged then say so ;; but continue checking remaining examples for consistency (format t "~%~%Convergence. Concept must be: ~A" (format-generalization (first *s*))) (setf converged t)))) (if converged t (if (and *s* *g*) (format t "~%~%Did not converge~%S= ~A~%G= ~A" (mapcar #'format-generalization *s*) (mapcar #'format-generalization *g*)))))) (defun update-g (instance) ;;; Specializes generalizations in G so that none match the given instance for a negative example. ;; For each generalization in G which matches the instance compute minimal specializations which do ;; not match and remove those which are not more general than (or equal to) some element of S. (setf *g* (mapcan #'(lambda (generalization) (if (match generalization instance) (remove-if-not #'(lambda (specialization) (member specialization *s* :test #'(lambda (a b) (or (more-general? a b) (equal-generalizations a b))))) (specializations-against generalization instance)) (list generalization))) *g*)) ;; Remove from G those elements which are more specific than some other element in G (setf *g* (clean-g *g*))) (defun clean-g (g) ;;; Remove from the list of generalizations g, any gens which are more specific or equal to some ;;; other gen in g. (dolist (generalization1 g) (dolist (generalization2 (rest (member generalization1 g))) (cond ((more-specific? generalization2 generalization1) (setf g (remove generalization2 g))) ((or (more-specific? generalization1 generalization2) (equal-generalizations generalization1 generalization2)) (setf g (remove generalization1 g)))))) g) (defun update-s (instance) ;;; Generalizes generalizations in S so that all match the given instance for a positive example ;; For each generalization in S which doesn't match the instance compute minimal generalizations ;; which do match and remove those which are not more specific than (or equal to) some element of G. (setf *s* (mapcan #'(lambda (generalization) (if (not (match generalization instance)) (remove-if-not #'(lambda (generalized) (member generalized *g* :test #'(lambda (a b) (or (more-specific? a b) (equal-generalizations a b))))) (generalizations-to generalization instance)) (list generalization))) *s*)) ;; Remove from S those elements which are more general than some other element of S (setf *s* (clean-s *s*))) (defun clean-s (s) ;;; Remove from the list of generalizations s, any gens which are more general or equal to some ;;; other gen in g. (dolist (generalization1 s) (dolist (generalization2 (rest (member generalization1 s))) (cond ((more-general? generalization2 generalization1) (setf s (remove generalization2 s))) ((or (more-general? generalization1 generalization2) (equal-generalizations generalization1 generalization2)) (setf s (remove generalization1 s)))))) s) (defun reverse-match (instance generalization) ;;; Matches instance to generalization instead of vice-versa (match generalization instance)) (defun more-specific? (a b) ;;; Returns T iff generalization a is strictly more specific than generalization b (more-general? b a)) ;;;;================================================================================================= ;;;; Functions specific to nominal feature vectors represented as ordered lists of feature values. ;;;;================================================================================================= (defvar *domains*) ; an ordered list specifying the domain of each feature (defvar *feature-names*) ; an ordered list of the names for each feature (defparameter *print-with-feature-names* nil) ; print out generalizations with feature names (defun equal-generalizations (x y) ;;; Equivalence function for simple feature vector representation (equal x y)) (defun match (generalization instance) ;;; Match function for a simple feature vector representation where "?" is a wildcard (or (equal generalization instance) (and (or (equal (first generalization)(first instance)) (eq (first generalization) '?)) (match (rest generalization) (rest instance))))) (defun more-general? (x y) ;;; Returns T iff generalization x is strictly more general than generalization y for ;;; a simple feature vector representation. For x to be more general than y, they must match ;;; and x must have a "?" where y has a specific value; however y must never have a "?" where ;;; x has a specific value (cond ((or (null x)(null y)) nil) ((and (eq (first x) '?) (not (eq (first y) '?)) (or (equal (rest x)(rest y)) (more-general? (rest x) (rest y)))) t) ((equal (first x) (first y)) (more-general? (rest x) (rest y))))) (defun initialize-g () ;;; Initialize G to a set containing the all "?" feature vector (list (mapcar #'(lambda (feature) (declare (ignore feature)) '?) *domains*))) (defun generalizations-to (generalization instance) ;;; Generalizes the given generalization in all ways just enough to match the instance. ;;; For simple feature vectors there is only one possible least generalization in which ;;; differing feature values are changed to "?"s (list (if (or (null generalization)(null instance)) nil (cons (if (equal (first generalization)(first instance)) (first generalization) '?) (first (generalizations-to (rest generalization)(rest instance))))))) (defun specializations-against (generalization instance) ;;; Specializes the given generalization in all ways just enough so it doesn't match the instance. ;;; For simple feature vectors, for each "?" in the generalization there is a least specialization ;;; for each possible value in the domain for that feature which is different from the value ;;; in the instance. (do ((gen-rest generalization (rest gen-rest)) (inst-rest instance (rest inst-rest)) (gen-bef nil (append gen-bef (list (first gen-rest)))) (domain-rest *domains* (rest domain-rest)) (specializations nil)) ((or (null gen-rest)(null inst-rest)) specializations) (if (eq (first gen-rest) '?) (setf specializations (append (mapcar #'(lambda (value) (append gen-bef (list value) (rest gen-rest))) (remove (first inst-rest) (first domain-rest))) specializations))))) (defun format-generalization (gen) ;;; Format a generalization into a prettier form for output ;;; ( (feature-name value) ...) for features with constrained values (if *print-with-feature-names* (do ((gen-rest gen (rest gen-rest)) (feature-rest *feature-names* (rest feature-rest)) (formated-gen nil)) ((null gen-rest) (nreverse formated-gen)) (unless (eq (first gen-rest) '?) (push (list (first feature-rest) (first gen-rest)) formated-gen))) gen)) (defun make-domains (levels-list) ;;; If features values are simply integers 0 to n then the number of values of a ;;; feature is sufficient for determining its domain. This function creates a ;;; list suitable for *domains* given a list of the number of values for each feature. ;;; See the file SOYBEAN-RDATA for a sample use. (mapcar #'(lambda (levels) (let ((domain nil)) (dotimes (i levels domain) (setf domain (nconc domain (list i)))))) levels-list)) (defun check-examples (examples) ;;; Checks a list of examples to make sure they have the correct number of features ;;; and that each feature value is a member of the domain for that feature. Prints ;;; an error message for each error. (do ((rest-examples examples (cdr rest-examples)) (instance nil) (example# 1 (1+ example#))) ((null rest-examples) nil) (if (member(first (first rest-examples)) '(+ -)) (setf instance (second (first rest-examples))) (setf instance (first rest-examples))) (unless (eq (length instance) (length *domains*)) (format t "~%Example ~D has the wrong number of features" example#)) (do ((rest-features instance (rest rest-features)) (feature# 1 (1+ feature#)) (rest-domains *domains* (rest rest-domains))) ((null rest-features) nil) (unless (member (first rest-features) (first rest-domains)) (format t "~%~%Error in feature ~D of example ~D" feature# example#) (format t "~%~A not a member of ~A" (first rest-features) (first rest-domains)))))) ;;;; ========================================================================================== ;;;; Functions for running and testing a single concept ;;;; ========================================================================================== (defun vs-test (examples &optional train#) ;;; Run and test version-spaces on the examples by using the first train# examples ;;; to train and the remaining to test (if (null train#) (setf train# (length examples))) (let ((training-examples (subseq examples 0 train#)) (testing-examples (subseq examples train#)) (start-time (get-internal-run-time))) (version-space (positive-first training-examples)) (format t "~%~%Run time: ~,2Fs" (seconds-since start-time)) (test-examples testing-examples))) (defun positive-first (examples) ;;; Put positive examples all first for efficiency with nominal feature vectors (append (remove-if-not #'(lambda (example) (eq (first example) '+)) examples) (remove-if-not #'(lambda (example) (eq (first example) '-)) examples))) (defun test-examples (examples) (when examples (let ((correct# 0)) (dolist (example examples) (let ((match (determine-match *s* *g* (second example)))) (if (or (and match (eq (first example) '+)) (and (not match) (eq (first example) '-))) (incf correct#)))) (format t "~%~%Percentage correct: ~A" (round (* 100 (/ correct# (length examples)))))))) (defun determine-match (s g instance) ;;; This sample DETERMINE-MATCH functions considers that an instance is in a category ;;; if it matches the majority of the generalizations in S and G. (let ((count 0)) (dolist (generalization (append s g)) (if (match generalization instance) (incf count))) (> count (/ (+ (length s) (length g)) 2)))) (defun check-consistency (examples) ;;; Checks the consistency of each generalization in S and G with the given list of examples. ;;; Prints an error message for each error found. Can be used to check correctness after ;;; running version-space on a list of examples. (dolist (generalization (append *s* *g*)) (dolist (example examples) (if (and (eq (first example) '-)(match generalization (second example))) (format t "~%~%Error: ~A matches ~A" generalization example)) (if (and (eq (first example) '+)(not (match generalization (second example)))) (format t "~%~%Error: ~A doesn't match ~A" generalization example))))) ;;;; ========================================================================================== ;;;; The following functions are for multiple concept (category) problems like the soybean data ;;;; ========================================================================================== (defun vs-categories (category-list) ;;; Version-space for multiple concept learning problems. The argument category-list ;;; should be a list of atoms which represent names of individual categories. A list of ;;; instances for learning should be stored on the LEARN-INSTANCES property of each ;;; category name. A single concept learning trial is run for each category in which the ;;; instances of that category are positive examples and instances of all other categories ;;; are negative examples. If convergence is reached for a given category, then the ;;; correct generalization is stored on the GENERALIZATION property of the category name. ;;; Otherwise the S and G sets are stored under the properties S-GENERALIZATIONS and ;;; G-GENERALIZATIONS. (let ((start-time (get-internal-run-time))) (dolist (category-name category-list) (format t "~%~%Category: ~A" category-name) (if (version-space (make-examples (get category-name 'learn-instances) (mapcan #'(lambda (a) (copy-list (get a 'learn-instances))) (remove category-name category-list)))) (setf (get category-name 'GENERALIZATION) (first *s*)) (progn (setf (get category-name 'GENERALIZATION) nil) (setf (get category-name 'S-GENERALIZATIONS) *s*) (setf (get category-name 'G-GENERALIZATIONS) *g*)))) (format t "~%~%Run time: ~,2Fs" (seconds-since start-time))) (vs-test-categories category-list)) (defun make-examples (pos-instances neg-instances) ;;; Converts lists of positive and negative instances into a list of examples ;;; suitable for VERSION-SPACE. (append (mapcar #'(lambda (instance) (list '+ instance)) pos-instances) (mapcar #'(lambda (instance) (list '- instance)) neg-instances))) (defun separate-instances (categories num-learn-instances) ;;; Separates a list of instances for a set of categories into learning and testing ;;; instances to facilitate experimentation in preparation for using VS-CATEGORIES ;;; and TEST-CATEGORIES. The variable categories should be bound to a list of ;;; category names whose values are a list of instances of that category. The first ;;; num-learn-instances of these instances are stored on the LEARN-INSTANCES property ;;; to be used for learning and the rest are stored on TEST-INSTANCES for testing. (dolist (category categories) (setf (get category 'learn-instances) (subseq (eval category) 0 num-learn-instances)) (setf (get category 'test-instances) (subseq (eval category) num-learn-instances (length (eval category)))))) (defun vs-test-categories (categories &optional learn-instances?) ;;; To be used after VS-CATEGORIES in order to test the generalizations learned on a ;;; set of new instances stored under the TEST-INSTANCES property of each category name ;;; in categories. Reports % correct for each category and overall % correct. (let ((percent-sum 0)(percent 0)) (dolist (category categories) (format t "~%~%Testing ~A instances" category) (let ((answers (mapcar #'(lambda (instance) (test-instance instance categories)) (get category (if learn-instances? 'learn-instances 'test-instances)))) (count 0)) (format t "~%~A" answers) (dolist (answer answers) (if (and (eq (first answer) category)(null (rest answer))) (incf count))) (setf percent (* 100 (/ count (length answers)))) (incf percent-sum percent) (format t "~%Percentage correct: ~A" (round percent)))) (format t "~%~%Total percent correct: ~A" (round (/ percent-sum (length categories)))))) (defun test-instance (instance categories) ;;; Given an instance and a list of category names returns the subset of these categories ;;; which it is determined that the instance belongs to. If the generalization learned ;;; for a category converged then just see if it matches it to determine whether or not ;;; the instance is in the category. Otherwise use the function DETERMINE-MATCH to ;;; decide whether or not the instance is in the category based on the S and G sets for ;;; the category. (let ((member-categories nil)) (dolist (category categories member-categories) (if (or (and (get category 'GENERALIZATION) (match (get category 'GENERALIZATION) instance)) (determine-match (get category 'S-GENERALIZATIONS) (get category 'G-GENERALIZATIONS) instance)) (push category member-categories))))) ;;;; ========================================================================================== ;;;; General utility functions ;;;; ========================================================================================== (defun seconds-since (time) ;;; Return seconds elapsed since given time (initially set by get-internal-run-time) (/ (- (get-internal-run-time) time) internal-time-units-per-second))