function  ueb23_4(test_str,N,dbg) 
%
%  function:  ueb23_4(test_str,N,dbg) 
%
%  Language identification: decide whether the text string `test_str`
%  is German, English, French or Italian, as follows:
%  a) Load the character N-grams of these languages (expected in
%     files ".ngram_N_L.mat", where N = 1|2|3 is the order of the
%     N-gram and L = G|E|F|I the language);
%     if an N-gram file is not available, the N-gram is estimated
%     from the text in file "ueb23_data/train_text_L.txt".
%  b) Evaluate the cross entropy of each N-gram for the test string.
%  c) Decide for the language with the minimum cross entropy.
%
%  Inputs (all optional):
%    test_str   text to classify; letters A-Z / a-z, '-' and blanks
%               (default: 'das grosse wort fuehren');
%               an explicitly passed empty string is an error
%    N          N-gram order, a scalar equal to 1, 2 or 3 (default: 2)
%    dbg        controls the amount of output
%               0   no additional output (default)
%               1   displays the cross entropy values
%               2   displays the used N-grams 
%
%  The result is printed with disp; nothing is returned.

if (nargin < 1)  
  test_str = 'das grosse wort fuehren';
elseif isempty(test_str)
  error('*** test string is empty')
end    
if nargin < 2
  N = 2;
elseif ~isscalar(N) || ~any(double(N) == [1 2 3])
  % Reject empty, vector, NaN and fractional orders as well: they would
  % otherwise slip through a plain range check and produce a malformed
  % N-gram file name further down.
  error(['*** N-gram order must be in the range of 1...3'])
end
if nargin < 3
  dbg = 0;
end  


%---- convert test string to numeric representation ----

test_num = char_to_num_conv(test_str);
if isempty(test_num)
  % the conversion yields nothing for blank-only input or after it has
  % reported an illegal character
  disp('converted test string is empty')
  disp('(processing stopped)')
  return
end


%---- load or estimate ngrams for German, English, French and Italian ----

vocsize = 27;
smval = 0.5;   % smoothing value for N-gram estimation
dbg0 = 0;      % debug level handed to the N-gram loader (always off)

ngram_G = load_ngram('G',N,vocsize,smval,dbg0);
ngram_E = load_ngram('E',N,vocsize,smval,dbg0);
ngram_F = load_ngram('F',N,vocsize,smval,dbg0);
ngram_I = load_ngram('I',N,vocsize,smval,dbg0);
if dbg == 2
  print_ngram_probs(ngram_G,[],[char(10) 'German ']);
  print_ngram_probs(ngram_E,[],[char(10) 'English ']);
  print_ngram_probs(ngram_F,[],[char(10) 'French ']);
  print_ngram_probs(ngram_I,[],[char(10) 'Italian ']);
end

%---- evaluate cross-entropy for input string ----

cr_entrop_G = eval_ngram_cross_entropy(test_num,ngram_G);
cr_entrop_E = eval_ngram_cross_entropy(test_num,ngram_E);
cr_entrop_F = eval_ngram_cross_entropy(test_num,ngram_F);
cr_entrop_I = eval_ngram_cross_entropy(test_num,ngram_I);

if dbg == 1
  str = 'cross entropy for '; 
  disp([str ' German  ' num2str(N) '-gram:  ' num2str(cr_entrop_G)])
  disp([str ' English ' num2str(N) '-gram:  ' num2str(cr_entrop_E)])
  disp([str ' French  ' num2str(N) '-gram:  ' num2str(cr_entrop_F)])
  disp([str ' Italian ' num2str(N) '-gram:  ' num2str(cr_entrop_I)])
end

%---- decide for the language with the smallest cross entropy ----

[min_val,idx] = min([cr_entrop_G cr_entrop_E cr_entrop_F cr_entrop_I]);
if isnan(min_val)
  % min skips NaN entries, so a NaN minimum means that no language
  % produced a usable value; min would still return index 1 and the
  % text would be reported as German. Force the undecided branch.
  idx = 0;
end
disp(['"' test_str '"'])
switch idx
case 1
  disp('--> is German')
case 2
  disp('--> is English')
case 3
  disp('--> is French')
case 4
  disp('--> is Italian')
otherwise
  disp('cannot decide which language')
end



%----- local functions -----------------------------------


function  ngram = load_ngram(L,N,vocsize,smval,dbg)
%
%  Return the N-gram of order N for language L.
%
%  The N-gram is loaded from file ".ngram_N_L.mat", where N = 1|2|3 is
%  the order of the N-gram and L = G|E|F|I the language.
%  If the N-gram file is not found, the N-gram is estimated from
%  text file "ueb23_data/train_text_L.txt" and stored in file
%  ".ngram_N_L.mat" for future use.
%
%  Inputs:
%    L         language code, one of 'G', 'E', 'F', 'I'
%    N         N-gram order, 1, 2 or 3
%    vocsize   vocabulary size, passed on to estim_ngram_probs
%    smval     smoothing value, passed on to estim_ngram_probs
%    dbg       debug level, passed on to estim_ngram_probs
%
%  Output:
%    ngram     variable "ngram" from the cache file, or the freshly
%              estimated N-gram
%
%  Errors: the cache file exists but holds no variable "ngram"; the
%  training text file is missing; L or N is not one of the supported
%  values (checked only when the N-gram has to be estimated).

ng_fln = ['.ngram_' num2str(N) '_' L '.mat'];
if exist(ng_fln,'file')
  % Load into a struct rather than straight into the workspace, so that
  % stray variables in the file cannot overwrite L, N, vocsize, ... and a
  % missing "ngram" is reported clearly.
  cached = load(ng_fln);
  if ~isfield(cached,'ngram')
    error(['*** file does not contain variable "ngram":  ' ng_fln])
  end
  ngram = cached.ngram;
else
  switch L
  case 'G',  lstr = 'German';
  case 'E',  lstr = 'English';
  case 'F',  lstr = 'French';
  case 'I',  lstr = 'Italian';
  otherwise
    error(['*** unknown language code:  ' L])
  end
  switch N
  case 1,  ngstr = 'unigram';
  case 2,  ngstr = 'bigram';
  case 3,  ngstr = 'trigram';
  otherwise
    error(['*** N-gram order must be in the range of 1...3'])
  end
  disp(['training ' ngstr ' for ' lstr ' ...'])
  % NOTE(review): fullpath is not defined in this part of the file;
  % presumably a project helper that resolves the data directory - confirm.
  tr_fln = [fullpath('ueb23_data') '/train_text_' L '.txt'];
  if ~exist(tr_fln,'file')
    error(['*** file not found:  ' tr_fln])
  end
  % NOTE(review): the meaning of the second argument (1) is defined by
  % load_training_text, which is not visible here.
  tr_data = load_training_text(tr_fln,1);
  ngram = estim_ngram_probs(tr_data,vocsize,N,smval,dbg);
  save(ng_fln,'ngram');
end



function  nstr = char_to_num_conv(cstr)
%
%  Convert a character string into its numeric word representation.
%
%  The input is split into words at blanks. Every word becomes a row
%  vector of symbol codes: 'A'-'Z' and 'a'-'z' map to 1...26 (case is
%  ignored) and '-' maps to 27. The words are returned as the cells
%  of nstr, in their original order.
%
%  If any other character is found, a message is displayed and an empty
%  result is returned. Input without any word also gives an empty result.

padded = [cstr ' '];     % trailing blank flushes the last word
nstr = [];
word = [];
for pos = 1:length(padded)
  ch = padded(pos);

  % a blank ends the current word (runs of blanks give no empty words)
  if ch == ' '
    if ~isempty(word)
      nstr{end+1} = word;
      word = [];
    end
    continue
  end

  % map the character to its symbol code
  code = double(ch);
  if (code >= 65) && (code <= 90)          % upper-case letter
    sym = code - 64;
  elseif (code >= 97) && (code <= 122)     % lower-case letter
    sym = code - 96;
  elseif ch == '-'
    sym = 27;
  else 
    disp(['illegal input character:  "' ch '"'])
    nstr = [];
    return
  end   
  word = [word sym];
end
