Logo Search packages:      
Sourcecode: malaga version File versions  Download package

hangul.c

/* Copyright (C) 1995 Bjoern Beutel. */

/* Description. =============================================================*/

/* This module handles the conversion of KSC5601 hangul characters and 
 * romanised Hangul to internal Malaga format (and back).
 * 
 * In this conversion module, we use four representations of
 * Hangul characters and syllables:
 * 1. KSC5601 code, used by a couple of programs (MULE for example),
 *    which codes single letters as well as whole syllables by codes
 *    that use two characters in the range of 0xa1 to 0xfe.
 * 2. Trigem code, which encodes syllables and letters by a 16-bit word
 *    divided as follows:
 *    (MSB) | S  |  I  I  I  I  I | V V V V V | F F F F F | (LSB)
 *          | 15 | 14 13 12 11 10 | 9 8 7 6 5 | 4 3 2 1 0 |
 *    where: "S" is set if the code represents a syllable (not a single letter).
 *           "IIIII" is the code of the initial consonant(s).
 *           (See "initial_consonants".)
 *           "VVVVV" is the code of the vowel (see "vowels").
 *           "FFFFF" is the code of the final consonant(s).
 *           (See "final_consonants".)
 * 3. Hancode, which is the internal, letter oriented code. The letters are
 *    represented by codes in the range of 0x81-0x9a, and 0x80 marks the
 *    beginning of a syllable.
 * 4. Roman code, which is similar to Hancode, but it uses the latin letters
 *    and adopts the Yale standard for Hangul romanization.
 *    Here, every syllable begins with a dot ".". */

/* Includes. ================================================================*/

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <setjmp.h>
#include "basic.h"
#include "pools.h"
#include "tries.h"
#include "ksc_table.h"
#include "hangul.h"

/* Constants. ===============================================================*/

/* Hancode representation of Hangul characters as strings.
 * Each letter macro expands to a one-character string literal whose code is
 * 0x80 plus the position of the macro's letter in the Latin alphabet
 * (A = "\x81" ... Z = "\x9a"); DOT ("\x80") is the syllable-start marker.
 * Only the letters listed here have a macro (no F, J, Q, R, V or Y).
 * Because they are string literals, writing several macros side by side
 * (e.g. "K K") concatenates them into one multi-letter Hancode string.
 * NOTE(review): these single-letter macro names are active for the rest of
 * this translation unit; avoid single uppercase letters as identifiers. */
#define DOT "\x80"
#define A "\x81"
#define B "\x82"
#define C "\x83"
#define D "\x84"
#define E "\x85"
#define G "\x87"
#define H "\x88"
#define I "\x89"
#define K "\x8b"
#define L "\x8c"
#define M "\x8d"
#define N "\x8e"
#define O "\x8f"
#define P "\x90"
#define S "\x93"
#define T "\x94"
#define U "\x95"
#define W "\x97"
#define X "\x98"
#define Z "\x9a"
/* Empty string, used for table slots that hold no letters. */
#define NUL ""

/* Hancode representation of Hangul characters as individual characters.
 * Same codes as the string macros above, but as character constants, for
 * comparing against or appending single characters. */
#define DOT_C '\x80'
#define A_C '\x81'
#define B_C '\x82'
#define C_C '\x83'
#define D_C '\x84'
#define E_C '\x85'
#define G_C '\x87'
#define H_C '\x88'
#define I_C '\x89'
#define K_C '\x8b'
#define L_C '\x8c'
#define M_C '\x8d'
#define N_C '\x8e'
#define O_C '\x8f'
#define P_C '\x90'
#define S_C '\x93'
#define T_C '\x94'
#define U_C '\x95'
#define W_C '\x97'
#define X_C '\x98'
#define Z_C '\x9a'

/* The Hancode vowel letters as one concatenated string literal; used as the
 * character set argument of strchr() when testing whether a letter is a
 * vowel. */
#define VOWELS A E I O U W

/* The initial consonants in a syllable.
 * Indexed by the 5-bit value that INITIAL_CONSONANT() extracts from bits
 * 10-14 of a Trigem code. Each entry is the Hancode string for that value;
 * NUL (the empty string) fills the slots that contribute no letters. */
static string_t initial_consonants[32] = 
{ 
  NUL, NUL,   K, K K,   N,   T, T T,   L, 
    M,   P, P P,   S, S S,   X,   C, C C,
    Z,   G,   D,   B,   H, NUL, NUL, NUL,
  NUL, NUL, NUL, NUL, NUL, NUL, NUL, NUL
};

/* The vowels in a syllable.
 * Indexed by the 5-bit value that VOWEL() extracts from bits 5-9 of a
 * Trigem code. Multi-letter entries such as "I A I" are adjacent string
 * literals that the compiler concatenates into one Hancode string. */
static string_t vowels[32] = 
{ 
  NUL, NUL, NUL,   A,   A I, I A, I A I,     E,
  NUL, NUL, E I, I E, I E I,   O,   O A, O I E,
  NUL, NUL, O I, I O,     W, W E, W E I,   W I,
  NUL, NUL, I W,   U,   U I,   I,   NUL,   NUL
};

/* The final consonants in a syllable.
 * Indexed by the 5-bit value that FINAL_CONSONANT() extracts from bits 0-4
 * of a Trigem code. NUL (the empty string) fills the slots that contribute
 * no letters. */
static string_t final_consonants[32] = 
{
  NUL, NUL,   K, K K, K S,   N, N C, N H,
    T,   L, L K, L M, L P, L S, L D, L B,
  L H,   M, NUL,   P, P S,   S, S S,   X,
    C,   Z,   G,   D,   B,   H, NUL, NUL
};

/* Macros. ==================================================================*/

/* True if bit 15 (the "S" bit) of TRIGEM is set, i.e. the code stands for a
 * whole syllable rather than a single letter. */
#define SYLLABLE(trigem) (((trigem) & 0x8000) != 0)

/* Hancode string for the initial-consonant field (bits 10-14) of TRIGEM. */
#define INITIAL_CONSONANT(trigem) (initial_consonants[ ((trigem) >> 10) & 31 ])

/* Hancode string for the vowel field (bits 5-9) of TRIGEM. */
#define VOWEL(trigem) (vowels[ ((trigem) >> 5) & 31 ])

/* Hancode string for the final-consonant field (bits 0-4) of TRIGEM. */
#define FINAL_CONSONANT(trigem) (final_consonants[ (trigem) & 31 ])

/* Global variables. ========================================================*/

bool_t convert_to_ksc; 
/* Indicates whether Hangul output is converted to KSC5601
 * (else output is converted to romanised Hangul).
 * Read by decode_hangul; reset to FALSE by init_hangul. */

/* Variables. ===============================================================*/

/* The Hancode for the KSC table: entry I is the Hancode string built from
 * ksc_table[I] by init_hangul. The strings are stored in STRING_POOL. */
static string_t hancode[ KSC_TABLE_SIZE ]; /* The Hancode for the KSC table. */

static pool_t string_pool; /* String pool with all Hancode syllables. */
/* The trie maps Hancode strings to their KSC codes; it is built in
 * init_hangul and released in terminate_hangul. */
static int_t *hancode_trie; /* Trie used to segmentise Hancode syllables. */
static int_t hancode_trie_root; /* Root node index of HANCODE_TRIE. */

/* Functions. ===============================================================*/

static int
compare_hancode_entries( const void *entry1, const void *entry2 )
/* Compare two trie entries. */
{ 
  return strcmp_no_case( ((trie_entry_t *) entry1)->key,
                         ((trie_entry_t *) entry2)->key );
}

/*---------------------------------------------------------------------------*/

void 
init_hangul( void )
/* Initialise the hangul module. */
{ 
  trie_entry_t hancodes[ KSC_TABLE_SIZE ]; /* KSC/Hancode pairs for trie. */
  int_t i, trigem;
  pool_t hancode_trie_pool;
  text_t *text;

  if (! hangul) 
    return;

  text = new_text();
  string_pool = new_pool( sizeof( char_t ) );
 
  /* Build Hancode strings from KSC_TABLE; copy entries to HANCODES. */
  for (i = 0; i < KSC_TABLE_SIZE; i++) 
  { 
    /* Build Hancode string out of Trigem code. */
    trigem = ksc_table[i];
    clear_text( text );
    if (SYLLABLE( trigem )) 
      add_char_to_text( text, DOT_C );
    add_to_text( text, INITIAL_CONSONANT( trigem ) );
    add_to_text( text, VOWEL( trigem ) );
    add_to_text( text, FINAL_CONSONANT( trigem ) );
    hancode[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
    
    /* Copy to trie entry table. */
    hancodes[i].key = hancode[i];
    hancodes[i].content = table_index_to_ksc( i );
  }
  free_text( &text );

  /* Sort the Hancode strings and build the trie. */ 
  qsort( hancodes, KSC_TABLE_SIZE, sizeof( trie_entry_t ), 
         compare_hancode_entries);
  new_trie( KSC_TABLE_SIZE, hancodes, &hancode_trie_pool, &hancode_trie_root );
  hancode_trie = pool_to_vector( hancode_trie_pool );
  free_pool( &hancode_trie_pool );

  convert_to_ksc = FALSE;
}

/*---------------------------------------------------------------------------*/

void 
terminate_hangul( void )
/* Terminate the hangul module: release the trie vector and the string pool
 * that init_hangul created. Does nothing unless HANGUL is set. */
{ 
  if (hangul) 
  { 
    free_mem( &hancode_trie );
    free_pool( &string_pool );
  }
}  

/* Conversion of Hancode to romanised Hangul. ===============================*/

static string_t 
hancode_to_roman( string_t hancode_string )
/* Convert Hancode string HANCODE_STRING to romanised Hangul.
 * Every maximal run of Hancode letters (codes DOT_C..Z_C) is emitted as
 * "{...}" with each letter replaced by its roman segment; all other
 * characters are copied unchanged.
 * Returns the new string produced by text_to_string. */
{ 
  /* Modified Yale roman representation for each of the Hancode letters,
   * indexed by (letter code - DOT_C). Empty entries are unused codes. */
  static string_t romans[27] = 
  { ".", "a", "ph", "c", "th", "e", "", "kh", "h", "i", "", "k", "l", "m", "n",
    "o", "p", "", "", "s", "t", "u", "", "wu", "ng", "", "ch"
  };

  string_t source_p, segment;
  text_t *result;

  result = new_text();
  source_p = hancode_string;
  while (*source_p != EOS) 
  { 
    if (ORD( *source_p ) < ORD( DOT_C ) || ORD( *source_p ) > ORD( Z_C )) 
    { 
      /* Not a Hancode letter: copy it verbatim. */
      add_char_to_text( result, *source_p++ );
      continue;
    }

    /* Convert a whole run of Hancode letters; the first one is known to be
     * in range, so the range test moves to the end of the loop. */
    add_char_to_text( result, '{' ); 
    do 
    { 
      segment = romans[ ORD( *source_p ) - ORD( DOT_C ) ];
      if (*segment == EOS) 
        complain( "Internal error." );

      /* Context-dependent spellings override the table entry. */
      if (*source_p == X_C) 
      { 
        /* Directly after the syllable marker this letter is not written. */
        if (source_p > hancode_string && source_p[-1] == DOT_C) 
          segment = "";
      } 
      else if (*source_p == I_C) 
      { 
        /* Next to another vowel this letter is written "y". */
        if ((source_p > hancode_string 
             && strchr( VOWELS, source_p[-1] ) != NULL) 
            || (source_p[1] != EOS 
                && strchr( VOWELS, source_p[1] ) != NULL)) 
        { 
          segment = "y"; 
        }
      } 
      else if (*source_p == O_C) 
      { 
        /* Followed by A_C the pair is written "wa". */
        if (source_p[1] == A_C) 
          segment = "w";
      } 
      else if (*source_p == W_C) 
      { 
        /* After I_C: "u" (giving "yu"); before another vowel: "w". */
        if (source_p > hancode_string && source_p[-1] == I_C) 
          segment = "u"; 
        else if (source_p[1] != EOS 
                 && strchr( VOWELS, source_p[1] ) != NULL) 
        { 
          segment = "w"; 
        }
      }

      add_to_text( result, segment );
      source_p++;
    } while (ORD( *source_p ) >= ORD( DOT_C ) 
             && ORD( *source_p ) <= ORD( Z_C ));
    add_char_to_text( result, '}' );
  }
  
  return text_to_string( &result );
}

/* Conversion of romanised Hangul to Hancode. ===============================*/

static string_t 
roman_to_hancode( string_t roman_string )
/* Convert romanised Hangul string ROMAN_STRING to Hancode.
 * Only the text between "{" and "}" is converted; everything else is copied
 * unchanged. Complains if a "}" is missing or if a letter inside the braces
 * is not a romanised Hangul letter.
 * Returns the new string produced by text_to_string. */
{ 
  /* All letter sequences that can be converted to hancode. */
  static struct {string_t roman; string_t hancode;} romans[] = 
  { 
    /* Two-letter strings must come first, so they win over their
     * one-letter prefixes in the linear search below. */
    {"ch", Z}, {"kh", G}, {"th", D}, {"ph", B}, 
    {"wu", W}, {"ng", X}, {"wa", O A}, {"yu", I W},
    
    {"a", A}, {"c", C}, {"e", E}, {"h", H}, {"i", I}, {"k", K}, {"l", L}, 
    {"m", M}, {"n", N}, {"o", O}, {"p", P}, {"r", L}, {"s", S}, {"t", T}, 
    {"u", U}, {"w", W}, {"x", X}, {"y", I}, {".", DOT},
    {NULL, NULL}
  };
  string_t input_p;
  int_t entry;
  text_t *result;

  result = new_text();
  input_p = roman_string;
  while (*input_p != EOS) 
  { 
    if (*input_p != '{') 
    { 
      /* Outside braces: copy verbatim. */
      add_char_to_text( result, *input_p++ );
      continue;
    }

    /* Skip the opening "{" and convert up to the closing "}". */
    input_p++;
    while (*input_p != '}') 
    { 
      if (*input_p == EOS) 
        complain( "Missing \"}\" in romanised Hangul." );

      /* Insert an "x" at beginning of syllable if vowel is following. */
      if (input_p[-1] == '.' 
          && strrchr( "aeiouwy", TO_LOWER( input_p[0] ) ) != NULL) 
      { 
        add_char_to_text( result, X_C ); 
      }

      /* Find the first table entry that is a prefix of the input
       * (compared without regard to case). */
      entry = 0;
      while (romans[ entry ].roman != NULL 
             && strncmp_no_case( input_p, romans[ entry ].roman, 
                                 strlen( romans[ entry ].roman ) ) != 0) 
      { 
        entry++; 
      }

      if (romans[ entry ].roman == NULL) 
      { 
        complain( "\"%c\" is not a romanised Hangul letter.", *input_p );
      } 
      else 
      { 
        add_to_text( result, romans[ entry ].hancode );
        input_p += strlen( romans[ entry ].roman );
      }
    }
    /* Jump over closing "}" */
    input_p++;
  }
  
  return text_to_string( &result );
}

/* Conversion of Hancode to KSC5601. ========================================*/

static string_t 
hancode_to_ksc( string_t hancode_string )
/* Convert Hancode string HANCODE_STRING to KSC5601 code.
 * Runs of Hancode letters are segmented greedily with HANCODE_TRIE, each
 * segment becoming one two-byte KSC character; a syllable marker that
 * starts no known segment is written as "{.}". All other characters are
 * copied unchanged. Returns the new string produced by text_to_string. */
{ 
  string_t source_p, probe_p;
  text_t *result;
  int_t best_code, found_code; 
  int_t node;
    
  result = new_text();
  source_p = hancode_string;
  while (*source_p != EOS) 
  { 
    if (ORD( *source_p ) < ORD( DOT_C ) || ORD( *source_p ) > ORD( Z_C )) 
    { 
      /* Not a Hancode letter: copy it verbatim. */
      add_char_to_text( result, *source_p++ );
      continue;
    }

    /* Walk the trie as far as possible; every successful lookup is a longer
     * match, so the last one is the longest segment.
     * KSC code is actually u_short_t, but trie entries are of type int_t. */
    node = hancode_trie_root;
    probe_p = source_p;
    best_code = 0;
    while (lookup_trie( hancode_trie, &node, &probe_p, &found_code )) 
    { 
      source_p = probe_p; 
      best_code = found_code;
    }

    if (best_code != 0) 
    { 
      /* Emit the KSC code, high byte first. */
      add_char_to_text( result, best_code >> 8 );
      add_char_to_text( result, best_code & 0xff );
    } 
    else if (*source_p == DOT_C) 
    { 
      add_to_text( result, "{.}" );
      source_p++;
    } 
    else 
      complain( "Internal error." );
  }     
  return text_to_string( &result );
}

/* Conversion of KSC5601 to Hancode. ========================================*/

static string_t 
ksc_to_hancode( string_t ksc_string )
/* Convert KSC5601 string KSC_STRING to Hancode format.
 * Bytes below 0x80 are copied unchanged; any other byte is taken as the
 * first byte of a two-byte KSC character, which is replaced by its Hancode
 * string from HANCODE.
 * Complains if the string ends in the middle of a two-byte character.
 * Returns a new string produced by text_to_string; the caller releases it
 * (encode_hangul does so with free_mem). */
{ 
  string_t ksc_string_p;
  text_t *hancode_text;
  int_t ksc_code;    

  hancode_text = new_text();
  ksc_string_p = ksc_string;
  while (*ksc_string_p != EOS) 
  { 
    if (ORD( *ksc_string_p ) < 0x80) /* Copy an ASCII character. */
      add_char_to_text( hancode_text, *ksc_string_p++ );
    else 
    { 
      /* A lead byte directly followed by the terminator is a truncated
       * character. Without this check the terminator would be used as the
       * second byte and the "+= 2" below would step past the end of the
       * string. As elsewhere in this file, complain is relied on not to
       * return. */
      if (ksc_string_p[1] == EOS) 
      { 
        free_text( &hancode_text );
        complain( "Incomplete KSC5601 character at end of string." );
      }

      /* Copy a KSC two-byte character. */
      ksc_code = (ORD( ksc_string_p[0] ) << 8) | ORD( ksc_string_p[1] );
      add_to_text( hancode_text, hancode[ ksc_to_table_index( ksc_code ) ] );
      ksc_string_p += 2;
    }
  }
  return text_to_string( &hancode_text );
}

/* Global conversion routines. ==============================================*/

void 
decode_hangul( string_t *string_p )
/* Decode *STRING_P to external format: KSC5601 if CONVERT_TO_KSC is set,
 * romanised Hangul otherwise. Does nothing unless HANGUL is set.
 * *STRING_P must be a string on the heap.
 * It will be replaced by the new string which is also on the heap. */
{ 
  string_t decoded;

  if (! hangul) 
    return;

  decoded = (convert_to_ksc ? hancode_to_ksc( *string_p ) 
                            : hancode_to_roman( *string_p ));

  /* Release the old string only after the conversion has read it. */
  free_mem( string_p );
  *string_p = decoded;
}

/*---------------------------------------------------------------------------*/

void 
encode_hangul( string_t *string_p )
/* Encode *STRING_P to internal format. Does nothing unless HANGUL is set.
 * The string is converted in two passes: KSC5601 characters first, then
 * the romanised Hangul between braces.
 * *STRING_P must be a string on the heap.
 * It will be replaced by the new string which is also on the heap. */
{ 
  string_t ksc_converted; /* Result of the first pass. */

  if (! hangul) 
    return;

  /* Pass 1: two-byte KSC characters to Hancode. */
  ksc_converted = ksc_to_hancode( *string_p );
  free_mem( string_p );

  /* Pass 2: romanised Hangul to Hancode; the intermediate string is no
   * longer needed afterwards. */
  *string_p = roman_to_hancode( ksc_converted );
  free_mem( &ksc_converted );
}

/* End of file. =============================================================*/

Generated by  Doxygen 1.6.0   Back to index