-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcreateMLSTlookup.m
196 lines (166 loc) · 6.58 KB
/
createMLSTlookup.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
function createMLSTlookup(varargin)
%% DESCRIPTION:
%
%   This function creates a suitable lookup table for MLST stimuli. This
%   uses the provided MLST word list (sent to CWB by Wu) to create
%   something closer to the HINT lookup table that is more robust.
%
%   The sentence spreadsheet is read from
%       fullfile(opts.specific.root, 'MLST Sentence Lists.xlsx')   (sheet 1)
%   and the lookup table is written to
%       fullfile(opts.specific.root, ['MLST (Adult)' suffix '.xlsx'])
%   where opts is returned by SIN_TestSetup(testID, '').
%
% INPUT:
%
%   All inputs are parameter/value pairs (parsed by varargin2struct).
%
% Parameters:
%
%   'testID':   string, testID used to lookup most appropriate test
%               information. Required.
%
%   'suffix':   suffix to append to the XLSX output file name.
%               Optional; defaults to '' (no suffix).
%
%   'wav_regexp':   regular expression to overwrite .specific.wav_regexp.
%                   Optional. The user is prompted before the overwrite.
%
% OUTPUT:
%
%   None. The lookup table is written to disk (see above).
%
% Notes on file name formatting:
%
%   MLST filenames follow the format of :
%       SentencePositionWithinlist#_Talker#_Sentence#_LexicalCategory.FileExtension
%       (MP3/MP4)
%
%   As a regular expression, this looks like:
%
%       '[0-9]{1,2}_T[0-9]{1,2}_[0-9]{3}_[HL][DS].mp3$'
%
%   So a lot of the information we need to create the lookup table is
%   present in the filename itself.
%
% Notes on Lookup Table format:
%
%   The table written by this function has the following variables. Note
%   that the first data row repeats a header ('ID', 'File Path', 'Legend',
%   NaN); this is retained for compatibility with the existing output.
%
%   ID: a short-hand ID used to identify the list and sentence number
%       (e.g., 01_05 corresponds to list 01 and sentence 05).
%
%   FilePath:   contains the directory (e.g., List_01) and filename
%               information WITHOUT the file extension. This makes it
%               possible to use the same lookup table for MP3 and MP4
%               format types.
%
%                   Ex.: 'List_01\1_T5_046_HS'
%
%   Sentence:   Sentence content. Keywords are marked using capital
%               letters.
%
%                   Ex. 'if you KICK the TAP it will RUN'
%
%   ScoringUnits: The number of keywords in the sentence
%
% Christopher W. Bishop
%   University of Washington
%   6/14

%% GET PARAMETERS
d = varargin2struct(varargin{:});

% testID is needed to look up the test options; fail early with a clear
% message rather than with a missing-field error.
if ~isfield(d, 'testID') || isempty(d.testID)
    error('createMLSTlookup:missingTestID', ...
        'The ''testID'' parameter is required.');
end

% The suffix is optional; default to no suffix.
if ~isfield(d, 'suffix') || isempty(d.suffix)
    d.suffix = '';
end

%% GET AN APPROPRIATE OPTIONS SET
%   Use this to query lists and files below. We'll use most of the
%   information to generate most of the content we need.
opts = SIN_TestSetup(d.testID, '');

%% OVERWRITE FILE FILTER IF NECESSARY
if isfield(d, 'wav_regexp')

    % Error check to make sure we haven't changed the field name to
    % something else. This must happen BEFORE the field is read to build
    % the prompt below.
    if ~isfield(opts.specific, 'wav_regexp')
        error('wav_regexp field name may have changed');
    end

    % The 's' flag returns the typed text without evaluating it as a
    % MATLAB expression.
    input(['Overwriting ' opts.specific.wav_regexp ' with ' d.wav_regexp '. Press enter to continue'], 's');

    opts.specific.wav_regexp = d.wav_regexp;

end % isfield

%% GET FILENAMES
%   Use the filenames to generate most of the content
[~, wavfiles] = SIN_stiminfo(opts);

%% LOAD SHEET 1
%   This contains the sentence text and target words. Also contains
%   sentence #, which is used below to find the matching row.
[~, ~, r] = xlsread(fullfile(opts.specific.root, 'MLST Sentence Lists.xlsx'), 1);

% Sentence numbers live in column 2 (row 1 is the header). Build the numeric
% vector once. Non-numeric or empty cells become NaN so that the positions
% in sentNums always line up with the spreadsheet rows.
sentCol = r(2:end, 2);
isNum = cellfun(@(x) isnumeric(x) && isscalar(x), sentCol);
sentNums = nan(numel(sentCol), 1);
sentNums(isNum) = [sentCol{isNum}];

% First entry of each column acts as a header row in the written table.
ID = {'ID'};
FilePath = {'File Path'};
Sentence = {'Legend'};
ScoringUnits = NaN;

for i = 1:numel(wavfiles)

    for w = 1:numel(wavfiles{i})

        % Breakdown to file parts
        [PATHSTR, NAME] = fileparts(wavfiles{i}{w});

        % Now break down file name into information we need
        %   SentencePosition#_Talker#_Sentence#_LexicalCategory
        C = strsplit(NAME, '_');
        if numel(C) < 4
            error('createMLSTlookup:badFilename', ...
                'Unexpected file name format: %s', wavfiles{i}{w});
        end

        % str2double does not evaluate its input (str2num does).
        SentList = str2double(C{1});
        SentNum = str2double(C{3});
        if isnan(SentList) || isnan(SentNum)
            error('createMLSTlookup:badFilename', ...
                'Could not parse sentence position/number from: %s', wavfiles{i}{w});
        end

        % Find sentence information from XLS file
        %   Note: the same sentence might be used more than once, so just
        %   grab the first instance of the sentence since the keywords will
        %   be the same in all sentences.
        row = find(sentNums == SentNum, 1, 'first');
        if isempty(row)
            error('createMLSTlookup:sentenceNotFound', ...
                'Sentence number %d (file %s) not found in spreadsheet.', ...
                SentNum, wavfiles{i}{w});
        end
        row = row + 1; % add 1 back in to account for header being omitted.

        % Get key words (columns 3-5)
        keywords = r(row, 3:5);

        % Get full sentence (column 6)
        if ~ischar(r{row, 6})
            error('createMLSTlookup:badSentence', ...
                'Sentence text for sentence number %d is not a string.', SentNum);
        end

        %   Make everything lower case. Below we'll capitalize the keywords
        %   for scoring purposes.
        sentence = lower(r{row, 6});

        % CWB encountered a special case with the word "grime-stained",
        % "one-toothed", and "three-legged". These hyphenated words were
        % not being flagged as "keywords" in the scoring GUI. This was
        % because GRIME is the keyword while stained is not. So half of the
        % word is a keyword ... odd. Anyway, CWB "fixed" this by replacing
        % the hyphens with a space. This splits the hyphenated words into
        % two words and the to-be-scored word should be flagged
        % appropriately in the scoring GUI.
        sentence = strrep(sentence, '-', ' ');

        sentence = SIN_removepunctuation(sentence);

        % Make sentence legend: capitalize each keyword in the sentence.
        for k = 1:numel(keywords)

            kw = keywords{k};
            if ~ischar(kw) || isempty(strtrim(kw))
                warning('createMLSTlookup:badKeyword', ...
                    'Keyword %d for sentence number %d is missing or not a string.', ...
                    k, SentNum);
                continue
            end

            % The sentence was lower-cased above, so the keyword must be
            % too or a capitalized spreadsheet entry would never match.
            kw = lower(strtrim(kw));

            % Prefer a whole-word match so a keyword is not flagged inside
            % a longer, unrelated word (e.g. "tap" inside "staple").
            pos = regexp(sentence, ['\<' regexptranslate('escape', kw) '\>'], 'once');

            % Fall back to the first plain substring match (e.g. keyword
            % "kick" within "kicked").
            if isempty(pos)
                pos = strfind(sentence, kw);
                if ~isempty(pos)
                    pos = pos(1);
                end
            end

            if isempty(pos)
                warning('createMLSTlookup:keywordNotFound', ...
                    'Keyword "%s" not found in sentence number %d: "%s"', ...
                    kw, SentNum, sentence);
                continue
            end

            % Capitalize it (for keyword scoring)
            sentence(pos:pos + numel(kw) - 1) = upper(kw);

        end % for k=1:numel(keywords)

        % Put all the information together into a format that's easy to
        % write to file

        % Make sure we have at least two digits for list # and file number.
        ID{end+1, 1} = sprintf('%02d_%02d', i, SentList); %#ok<AGROW>

        % Note that the file extension (.mp3 or .mp4) is omitted. This
        % makes data lookup easier down the road (CWB thinks, at least).
        %
        % Only the immediate parent directory (e.g., 'List_01') is kept.
        % Take the last path component rather than a fixed number of
        % trailing characters so directory names of any length work.
        [~, dirName, dirExt] = fileparts(PATHSTR);
        FilePath{end+1, 1} = fullfile([dirName dirExt], NAME); %#ok<AGROW>

        Sentence{end+1, 1} = sentence; %#ok<AGROW>

        % Number of keyword columns read from the spreadsheet; this should
        % always be 3 for MLST. Keywords that could not be flagged above
        % produce a warning but are still counted here.
        ScoringUnits(end+1, 1) = numel(keywords); %#ok<AGROW>

    end % w=1:numel(wavfiles{i})

end % for i=1:numel(wavfiles)

% Create a table
t = table(ID, FilePath, Sentence, ScoringUnits);

% Write table to XLSX file
writetable(t, fullfile(opts.specific.root, ['MLST (Adult)' d.suffix '.xlsx']));