function  [F,dur,f0] = load_training_data(fln)
%
%  function:  [F,dur,f0] = load_training_data(fln);
%
%  Load training data from the text file `fln`. The file is a sequence of
%  whitespace-separated tokens (blank, tab, newline, carriage return, form
%  feed); every group of 12 consecutive tokens describes one phone:
%    <phone_id> <duration> <fundamental freq> <sentence#> <syllable#>
%    <phone#> <syllable size> <syllable part> <language> <stress level>
%    <phrase type> <phrbnd>
%  <sentence#> and <language> are present in the file but are not returned.
%
%  Input:
%    fln = name of training data file
%
%  Output:
%    dur = phone duration (column vector, one entry per phone)
%    f0  = fundamental frequency (column vector, one entry per phone)
%    F   = factors struct with the following fields (one row per phone):
%          .phone_id:         phone identifier in ETHPA notation
%          .phone_number:     the phone number (position) within a syllable
%          .syllable_size:    the syllable size in number of phones
%          .phone_position:   position of the phone within its syllable:
%                             'a':  in syllable onset (Ansatz)
%                             'n':  in syllable nucleus (Nukleus)
%                             'k':  in syllable coda (Koda)
%          .accent:           accent or stress level of syllable: 0 ... 4
%                             0:  no stress
%                             1:  main stress
%                             2:  secondary stress
%                             4:  weakest stress
%          .syllable_number:  syllable number within phrase, where phrases
%                             are delimited by '0' and '1' boundaries only
%                             (phones after the last such boundary keep 0)
%          .phrase_size:      number of syllables in phrase, where phrases
%                             are delimited by '0' and '1' boundaries only
%                             (phones after the last boundary keep 0)
%          .phrase_type:      phrase type:  'P' for progressive
%                                           'T' for terminal (sentence-final)
%          .phrase_boundary:  phrase boundary after current syllable:
%                             '-':  no phrase boundary
%                             '2':  weak phrase boundary
%                             '1':  strong phrase boundary
%                             '0':  sentence boundary
%
%  Errors:
%    - the file does not exist or cannot be opened
%    - the file contains no tokens, or the token count is not a multiple
%      of 12
%
%  Numeric tokens are converted with str2double, so a non-numeric token
%  yields NaN in the corresponding output.

elemnr = 12;  % number of elements (tokens) per phone entry

if exist(fln,'file') == 0,
  error(['*** training data file "' fln '" not found!']);
end

fid = fopen(fln,'r');
if fid <= 0
  error(['*** cannot open file "' fln '"']);
end

% read the entire file at once (one char per byte)
txt = char(fread(fid))';
fclose(fid);

% Split into tokens in a single pass. Unlike a chunked scan that cuts each
% chunk at its last blank, this also keeps the final token of a file that
% does not end with whitespace, and copes with arbitrarily long runs of
% whitespace.
tokens = regexp(txt,'[^ \f\n\r\t]+','match');
ntok = numel(tokens);
if ntok == 0 || mod(ntok,elemnr) ~= 0,
  error('*** Error reading tokens of prosody corpus!');
end
nphones = ntok/elemnr;

% one row per phone, one column per element of the line format
tok = reshape(tokens,elemnr,nphones).';

dur = str2double(tok(:,2));
f0 = str2double(tok(:,3));
% syllable number as in .dat file: syllable number per sentence
syl_in_sentence = str2double(tok(:,5));
bnd = tok(:,12);            % phrase boundary symbols as cell strings

% create C structure (field order is part of the returned struct)
C = [];
C.phone_id = char(tok(:,1));
C.phone_number = str2double(tok(:,6));
C.syllable_size = str2double(tok(:,7));
C.phone_position = char(tok(:,8));
C.accent = str2double(tok(:,10));
C.syllable_number = zeros(nphones,1);
C.phrase_size = zeros(nphones,1);
C.phrase_type = char(tok(:,11));
C.phrase_boundary = char(bnd);

% A boundary symbol only counts on the last phone of a syllable.
is_syllable_end = (C.phone_number == C.syllable_size);
is_major_bnd = strcmp(bnd,'0') | strcmp(bnd,'1');

% --- phrase sizes with respect to 0, 1 and 2 boundaries ------------------
phre = find(ismember(bnd,{'0' '1' '2'}) & is_syllable_end); % phrase endings
all_boundaries = [0; phre(:)];   % index of last phone of every phrase
if ~isempty(phre)
  phrs = all_boundaries(1:end-1) + 1;   % phrase starts
  phrlen = syl_in_sentence(phre) - syl_in_sentence(phrs) + 1;
  for i = 1:length(phrs),
    C.phrase_size(phrs(i):phre(i)) = phrlen(i);
  end
end

% --- syllable number in phrase (instead of syllable number in sentence
% as given in the .dat file); only 0 and 1 boundaries delimit phrases here
major_boundaries = find(is_syllable_end & is_major_bnd);
major_boundaries = [0; major_boundaries(:)];
phrase_number = 1; % number of phrase in sentence
for i = 1:length(major_boundaries)-1
  idx = major_boundaries(i)+1:major_boundaries(i+1);
  if phrase_number == 1    % first phrase in a sentence
    C.syllable_number(idx) = syl_in_sentence(idx);
  else % later phrases: count from the last syllable of the previous phrase
    C.syllable_number(idx) = ...
        syl_in_sentence(idx) - syl_in_sentence(major_boundaries(i));
  end
  phrase_number = phrase_number + 1;
  % if we are at the end of a sentence, reset phrase counting
  if strcmp(bnd{major_boundaries(i+1)},'0')
    phrase_number = 1;
  end
end

% --- adapt phrase size taking only the 0 and 1 boundaries ---------------
% Up to here phrase sizes are taken with respect to 0, 1, 2 boundaries;
% phrases separated only by a '2' boundary are merged and their sizes
% summed.
nbnd = length(all_boundaries);
i = 1;
while i <= nbnd - 1
  start_index = all_boundaries(i) + 1;
  curr_phrase_end = i + 1;
  new_phrase_size = C.phrase_size(all_boundaries(curr_phrase_end));
  % While the phrase ends in a '2' boundary, absorb the following phrase.
  % The bound check stops cleanly when the corpus itself ends in '2'.
  while curr_phrase_end < nbnd && ...
        strcmp(bnd{all_boundaries(curr_phrase_end)},'2')
    curr_phrase_end = curr_phrase_end + 1;
    new_phrase_size = new_phrase_size + ...
        C.phrase_size(all_boundaries(curr_phrase_end));
  end
  % set all elements in new phrase to new phrase size
  C.phrase_size(start_index:all_boundaries(curr_phrase_end)) = new_phrase_size;
  i = curr_phrase_end;
end

F = C;

