-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathpopbrowser.m
421 lines (341 loc) · 12.7 KB
/
popbrowser.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
function popbrowser(gp,dataset,ID,complexityType,logR2)
%POPBROWSER Visually browse complexity and performance characteristics of a population.
%
%   POPBROWSER(GP) shows a plot of the current population on the axes
%   fitness vs complexity. The Pareto front is plotted as a series of green
%   circles. Non Pareto front individuals are shown as blue circles. The
%   best individual (as evaluated on the training data) is highlighted with
%   a red circle.
%
%   For multigene symbolic regression, POPBROWSER(GP) shows a scatterplot
%   of 1 - R^2 (coefficient of determination) vs expressional complexity.
%   By default 1 - R^2 is calculated on the training data.
%
%   Clicking on a circle reveals the numeric population identifier ID of
%   the corresponding GP individual(s) and, if a multigene regression
%   model, a simplified overall model equation (to 2 digits of precision
%   using the 'fast' MuPAD simplification mode).
%
%   Additionally, for multigene regression models:
%
%   To specify a different data set to compute 1 - R^2 on, use
%
%   POPBROWSER(GP,DATASET) where DATASET can equal 'train','val' or 'test'.
%
%   POPBROWSER(GP,DATASET,ID) adds a magenta dot to represent a user
%   supplied model identifier ID, which can be a numeric model ID of a
%   multigene model in the GP population or the 'best', 'valbest' or
%   'testbest' model. ID can also be a struct representing a multigene
%   regression model generated by GPMODEL2STRUCT or GENES2GPMODEL. This is
%   useful for examining the performance of user tailored models when used
%   in conjunction with UNIQUEGENES and GENEBROWSER.
%
%   The plotted tree complexity is by default 'expressional complexity'
%   even if the run was performed using 'node count' as a measure of
%   complexity. However, either measure can be displayed by setting
%   COMPLEXITYTYPE to 1 (expressional) or 0 (node count) using:
%
%   POPBROWSER(GP,DATASET,ID,COMPLEXITYTYPE) where ID may be set to empty
%   ([]) if you don't want to plot a user supplied model.
%
%   To plot a log Y-axis use:
%
%   POPBROWSER(GP,DATASET,ID,COMPLEXITYTYPE,LOGR2) with LOGR2 = TRUE. This
%   gives better visual resolution between high performance models.
%
%   An error is raised if DATASET, ID, COMPLEXITYTYPE or LOGR2 are invalid
%   or if the requested validation/test data is not available. In that
%   case the (still hidden) browser figure is closed before the error is
%   propagated, so no invisible figure is left behind.
%
%   Copyright (c) 2009-2015 Dominic Searson
%
%   GPTIPS 2
%
%   See also SUMMARY, RUNTREE, GPMODELREPORT, PARETOREPORT, GPMODELFILTER,
%   GENEBROWSER, GENES2GPMODEL, UNIQUEGENES

if nargin < 1
    disp('Basic usage is POPBROWSER(GP)');
    return;
end

%defaults for omitted or empty optional arguments
if nargin < 2 || isempty(dataset)
    dataset = 'train';
end

if nargin < 3
    ID = [];
end

if nargin < 4 || isempty(complexityType)
    complexityType = 1;
end

if nargin < 5 || isempty(logR2)
    logR2 = false;
end

if ischar(complexityType) || ischar(logR2)
    error('complexityType and logR2 parameters must not be strings.');
end

if complexityType < 0 || complexityType > 1
    error('Complexity type must be 0 = node count or 1 = expressional.');
end

if gp.runcontrol.pop_size > 750
    disp('Please wait, performing Pareto sort of population ...');
end

%resolve the complexity measure once so the plotting code below does not
%have to branch on complexityType at every step
if complexityType
    xvals = gp.fitness.complexity;
    bestComplexity = gp.results.best.complexity;
    xlabelContent = 'Expressional complexity';
else
    xvals = gp.fitness.nodecount;
    bestComplexity = gp.results.best.nodecount;
    xlabelContent = 'Number of nodes';
end

%second line of the figure title
if ~isempty(gp.userdata.name)
    setname = ['Data: ' gp.userdata.name];
else
    setname = '';
end

%string for figure title
mergeStr = '';
if gp.info.merged && gp.info.filtered
    mergeStr = ' (merged & filtered)';
elseif gp.info.merged
    mergeStr = ' (merged)';
elseif gp.info.filtered
    mergeStr = ' (filtered)';
end

%multigene regression is detected from the name of the fitness function
mgmodel = strncmpi(func2str(gp.fitness.fitfun),'regressmulti',12);

%the figure stays hidden until it is fully populated
browserFig = figure('visible','off','name','GPTIPS 2 Population browser');

try
    ax1 = axes('parent',browserFig,'box','on');

    if mgmodel

        %1-R^2 values for the requested data set (errors if unavailable)
        [yvals,yvalBest,ylabelContent] = popbrowserR2Data(gp,dataset);

        %plot all models' 1-R2
        bluedots = plot(ax1,xvals,yvals,'o');
        set(bluedots,'markeredgecolor','none','markerfacecolor',[0 0.45 0.74]);
        hold(ax1,'on');

        %convert any user supplied identifier to a model struct
        if ~isempty(ID)
            ID = popbrowserUserModel(gp,ID);
        end

        %highlight models on the pareto front with green circles
        xrank = ndfsort_rank1([yvals xvals]);
        greendots = plot(ax1,xvals(xrank==1),yvals(xrank==1),'o');
        set(greendots,'markerfacecolor','green','markeredgecolor',[0.25 0.25 0.25]);

        gp.fitness.values = yvals; %for use with datacursor

        %plot supplied model in magenta
        if ~isempty(ID)

            plotmodeldot = true;
            if strcmpi(dataset,'train') && ~ID.train.warning
                modeldotYval = 1 - ID.train.r2;
            elseif strcmpi(dataset,'val') && ~ID.val.warning
                modeldotYval = 1 - ID.val.r2;
            elseif strcmpi(dataset,'test') && ~ID.test.warning
                modeldotYval = 1 - ID.test.r2;
            else %cannot plot this model on this data set
                plotmodeldot = false;
            end

            if plotmodeldot
                if complexityType
                    modeldotXval = ID.expComplexity;
                else
                    modeldotXval = ID.numNodes;
                end
                modeldot = plot(ax1,modeldotXval,modeldotYval,'mo','linewidth',1,'markersize',8);
                set(modeldot,'markerfacecolor','magenta','markeredgecolor','black');
            end
        end

        %plot "best" model found on training data circled in red
        plot(ax1,bestComplexity,yvalBest,'ro','linewidth',2,'markersize',8);
        grid(ax1,'on');
        ylabel(ax1,ylabelContent);

        %for R2, always set y-axis between 0 and 1
        set(ax1,'Ylim',[0 1]);
        xlabel(ax1,xlabelContent);
        hold(ax1,'off');

        title(ax1,{['Population' mergeStr ' models = ' num2str(gp.runcontrol.pop_size)],...
            setname},'interpreter','none','FontWeight','bold');

        %change y axis if log (1-R^2) vals required
        if logR2
            set(ax1,'Yscale','log');
            set(ax1,'Ylimmode','auto');
        end

    else %for other fitness functions, plot raw training fitness values

        bluedots = plot(ax1,xvals,gp.fitness.values,'o');
        set(bluedots,'markeredgecolor','none','markerfacecolor',[0 0.45 0.74]);
        hold(ax1,'on');
        grid(ax1,'on');

        %plot 'best' on training data
        plot(ax1,bestComplexity,gp.results.best.fitness,'ro','linewidth',2,'markersize',8);
        ylabel(ax1,gp.fitness.label);
        xlabel(ax1,xlabelContent);

        %the Pareto sort is applied to mo * fitness, so maximisation
        %problems are sign flipped first
        if gp.fitness.minimisation
            mo = 1;
        else
            mo = -1;
        end

        %highlight individuals on the pareto front with green circles
        xrank = ndfsort_rank1([(mo * gp.fitness.values) xvals]);
        greendots = plot(ax1,xvals(xrank == 1),gp.fitness.values(xrank == 1),'o');
        set(greendots,'markerfacecolor','green','markeredgecolor',[0.25 0.25 0.25]);
        hold(ax1,'off');

        %same layout as the regression title: merge/filter note first,
        %then the population size
        title(ax1,{['Population' mergeStr ' = ' num2str(gp.runcontrol.pop_size)],...
            setname},'interpreter','none','FontWeight','bold');
    end

    %the data cursor callbacks read the GP struct (and the complexity
    %measure in use) back from the figure's userdata
    gp.complexityType = complexityType;
    grid(ax1,'on');
    set(browserFig,'userdata',gp,'numbertitle','off','visible','on');

    %enable datacursor mode on the browser figure itself (not on whatever
    %figure happens to be current)
    dcManager = datacursormode(browserFig);
    if mgmodel && gp.info.toolbox.symbolic
        set(dcManager,'UpdateFcn',@disp_mgmodel);
    else
        set(dcManager,'UpdateFcn',@disp_indiv);
    end
    set(dcManager,'SnapToDataVertex','on');
    set(dcManager,'enable','on');
    drawnow;

catch err
    %do not leave a hidden, half built figure behind
    if ishandle(browserFig)
        close(browserFig);
    end
    rethrow(err);
end

function [yvals,yvalBest,ylabelContent] = popbrowserR2Data(gp,dataset)
%Returns 1-R^2 for the population and for the 'best' model, plus the
%matching y-axis label, for the data set 'train', 'val' or 'test'.
if strcmpi(dataset,'train')
    yvals = 1 - gp.fitness.r2train;
    ylabelContent = '1-R^2 (training)';
    yvalBest = 1 - gp.results.best.r2train;
elseif strcmpi(dataset,'val')
    if ~isfield(gp.fitness,'r2val')
        error('No validation data was found.');
    end
    yvals = 1 - gp.fitness.r2val;
    ylabelContent = '1-R^2 (validation)';
    yvalBest = 1 - gp.results.best.r2val;
elseif strcmpi(dataset,'test')
    if ~isfield(gp.fitness,'r2test')
        error('No test data was found.');
    end
    yvals = 1 - gp.fitness.r2test;
    ylabelContent = '1-R^2 (testing)';
    yvalBest = 1 - gp.results.best.r2test;
else
    error('The specified data set must be ''train'',''val'' or ''test''.');
end

function model = popbrowserUserModel(gp,ID)
%Converts a user supplied model identifier (model struct, numeric
%population index, 'best', 'valbest' or 'testbest') into a model struct.
if isa(ID,'struct') %user supplied multigene regression model structure
    if ~(isfield(ID,'expComplexity') && isfield(ID,'numNodes'))
        error('Invalid multigene regression model structure supplied as ID');
    end
    model = ID;
elseif isnumeric(ID) && numel(ID) == 1 %numeric population index
    if ID > gp.runcontrol.pop_size || ID < 1
        error('Supplied population index is invalid.');
    end
    model = gpmodel2struct(gp,ID,false,false,true);
elseif ischar(ID) && strcmpi(ID,'best')
    model = gpmodel2struct(gp,'best',false,false,true);
elseif ischar(ID) && strcmpi(ID,'valbest')
    model = gpmodel2struct(gp,'valbest',false,false,true);
    if ~model.valid
        error('No validation data was found.');
    end
elseif ischar(ID) && strcmpi(ID,'testbest')
    model = gpmodel2struct(gp,'testbest',false,false,true);
    if ~model.valid
        error('No test data was found.');
    end
else %unrecognised
    error('Invalid model identifier supplied.');
end

function txt = disp_indiv(~,event_obj)
%DISP_INDIV Returns population member ID(s) to the datacursor.
%
%   TXT = DISP_INDIV(~,EVENT_OBJ) is a data cursor update function.
%   EVENT_OBJ supplies the clicked graphics object (Target) and the
%   selected data point (Position). TXT is a cell array of strings holding
%   a header followed by the index of every individual whose stored fitness
%   value and complexity both equal the selected point exactly, or a 'not
%   found' note if there is no such individual. TXT is empty if the clicked
%   object is not a line.

target = get(event_obj,'Target');

if ~strcmp(get(target,'Type'),'line')
    txt = '';
    return;
end

%read the GP data from the figure that owns the clicked object rather than
%from the current figure, which may be a different figure
gp = get(ancestor(target,'figure'),'userdata');

pos = get(event_obj,'Position');
comp = pos(1);
fitness = pos(2);

%locate in population
fitInd = find(gp.fitness.values == fitness);
if gp.complexityType
    compInd = find(gp.fitness.complexity == comp);
else
    compInd = find(gp.fitness.nodecount == comp);
end

ind = intersect(fitInd,compInd);
numInds = numel(ind);

if numInds == 0
    %the selected point does not correspond to any population member
    txt = {'Individual not found in population.'};
    return;
end

txt = cell(numInds+1,1);
txt{1} = 'Individual ID: ';
for i = 1:numInds
    txt{i+1} = int2str(ind(i));
end

function txt = disp_mgmodel(~,event_obj)
%DISP_MGMODEL Returns multigene regression model info to the datacursor.
%
%   TXT = DISP_MGMODEL(~,EVENT_OBJ) is a data cursor update function.
%   EVENT_OBJ supplies the clicked graphics object (Target) and the
%   selected data point (Position). For every model whose stored fitness
%   value and complexity both equal the selected point exactly, TXT holds
%   the population index and the symbolic model expression shown to 2
%   digits of precision. At most 5 matching models are listed. TXT is a
%   'not found' note if no model matches and is empty if the clicked object
%   is not a line.

maxShown = 5; %upper limit on the number of models listed in one datatip

target = get(event_obj,'Target');

if ~strcmp(get(target,'Type'),'line')
    txt = '';
    return;
end

%read the GP data from the figure that owns the clicked object rather than
%from the current figure, which may be a different figure
gp = get(ancestor(target,'figure'),'userdata');

pos = get(event_obj,'Position');
complexity = pos(1);
fitness = pos(2);

%locate in population
fitInd = find(gp.fitness.values == fitness);
if gp.complexityType
    compInd = find(gp.fitness.complexity == complexity);
else
    compInd = find(gp.fitness.nodecount == complexity);
end

ind = intersect(fitInd,compInd);
numInds = numel(ind);

if numInds == 0
    txt = {'Model not found in population.'}; %e.g. valbest frequently isn't in final population
    return;
end

%truncate whenever more models match than are displayed, so the limit
%stated in the message is the limit actually applied
if numInds > maxShown
    disp('Multiple matching models: only displaying first 5.');
    ind = ind(1:maxShown);
    numInds = maxShown;
end

txt = cell(numInds+1,2);
txt{1,1} = 'Individual ID: ';
txt{1,2} = 'Model: ';

for i = 1:numInds
    txt{i+1,1} = int2str(ind(i));
    try
        txt{i+1,2} = char(vpa(gpmodel2sym(gp,ind(i),true),2)); %only display 2 digits of precision
    catch
        %conversion to a symbolic expression failed for this model
        txt{i+1,2} = 'Invalid model';
    end
end