[FrontPage] [TitleIndex] [WordIndex

Note: You are looking at a static copy of the former PineWiki site, used for class notes by James Aspnes from 2003 to 2012. Many mathematical formulas are broken, and there are likely to be other bugs as well. These will most likely not be fixed. You may be able to find more up-to-date versions of some of these notes at http://www.cs.yale.edu/homes/aspnes/#classes.

1. Longest pronounceable subsequence

Let us define the longest pronounceable subsequence of a string of characters is the longest subsequence obtained by deleting zero or more characters from the original string, in which each pair of consecutive characters appear in a dictionary of pronounceable digrams. This is not a perfect model of pronounceability, but it almost works as a heuristic for languages like English.

For example, if the dictionary contains the strings AA and AB, then the strings AAA and AAB are both longest pronounceable subsequences of ACABA.

2. Your task

Your task is to write a program that extracts the longest pronounceable subsequence from its input, if it is unique. If there is more than one sequence of the same maximum length, your program should print instead the number of such sequences, formatted as with printf("%d\n", count). The dictionary of acceptable digrams is provided in a file whose name is given in argv[1], and the input sequence is just all of the characters supplied on stdin, terminated by end-of-file. Your program should compute some longest pronounceable subsequence and write it to stdout. You may assume that the input does not contain any null characters.

3. Dictionary format

The dictionary file is a sequence of 2n characters, where n is the number of digrams in the dictionary. It is terminated by end-of-file. The first digram consists of the first two characters, the second of the second two characters, and so forth. Note that spaces, tabs, newlines, and other non-printing characters may appear as part of a digram; this is necessary to allow the longest pronounceable subsequence to contain such characters.

Here are some sample dictionaries to work with:

The first is a very small dictionary for testing. The second consists of all pairs of adjacent characters from the Project Gutenberg edition of Moby Dick, after removing carriage-return characters.

4. Test cases

The longest pronounceable subsequence of abracadabra using simple.dict is abaaaba. There are two longest pronounceable subsequences of aca (aa and ac) using simple.dict. The file napoleon.1000 contains a quotation from a famous general, which can be extracted by searching for the longest pronounceable subsequence using moby.dict. More test cases will be provided in /c/cs223/Hwk7 on the Zoo.

5. Submitting your assignment

You may either submit a single file lps.c, or multiple files together with a file Makefile that will produce an executable file lps when called with make lps.

6. Hints

7. Sample solution

   1 /* compute the longest pronounceable subsequence of the input */
   2 /* Pronounceability is defined by the rule that a sequence is 
   3  * pronounceable if and only if every group of 3 consecutive
   4  * characters appears in a dictionary of permitted trigrams,
   5  * which is supplied in a file whose name is given by argv[1]. */
   6 
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <assert.h>
  10 #include <string.h>
  11 
  12 #define NUM_CHARS (256)
  13 
  14 typedef char Dict[NUM_CHARS][NUM_CHARS];
  15 
  16 /* read the dictionary from f */
  17 /* dict[i][j][k] = 0 if ijk does not appear in dictionary */
  18 /* dict[i][j][k] = 1 if ijk does appear in dictionary */
  19 static void
  20 read_dictionary(FILE *f, Dict dict)
  21 {
  22     int i;
  23     int j;
  24 
  25     for(i = 0; i < NUM_CHARS; i++) {
  26         for(j = 0; j < NUM_CHARS; j++) {
  27             dict[i][j] = 0;
  28         }
  29     }
  30 
  31     while((i = getc(f)) != EOF
  32           && (j = getc(f)) != EOF) {
  33         dict[i][j] = 1;
  34     }
  35 }
  36 
  37 /* return a null-terminated string containing entire contents of f */
  38 static char *
  39 read_all(FILE *f)
  40 {
  41     char *buf;
  42     size_t len;
  43     size_t size;
  44     int c;
  45 
  46     size = 1;
  47     buf = malloc(size);
  48     if(buf == 0) return 0;
  49     len = 0;
  50 
  51     while((c = getc(f)) != EOF) {
  52         while(len + 2 >= size) {
  53             size *= 2;
  54             buf = realloc(buf, size);
  55             if(buf == 0) return 0;
  56         }
  57         buf[len++] = c;
  58     }
  59 
  60     buf[len++] = '\0';
  61 
  62     buf = realloc(buf, len);
  63     return buf;
  64 }
  65 
  66 /* compute and return an lps of input */
  67 /* return value is a freshly-malloced null-terminated string */
  68 /* returns count of all LPSs in *count (no overflow protection!) */
  69 static char *
  70 longest_pronounceable_subsequence(const char *input, Dict dict, int *count)
  71 {
  72     const unsigned char *in;            /* unsigned version of input */
  73     char *out;
  74     int n;
  75     int i;      /* last character */
  76     int i2;     /* second-to-last character */
  77     int best;   /* best table entry */
  78     int best_length;    /* best length */
  79     int scan;   /* for reconstructing LPS */
  80 
  81     /* table[i][c] describes the LPS ending at i whose second-to-last
  82      * character is c, or just one character by itself */
  83     struct {
  84         int length;           /* length of LPS ending here */
  85         int count;            /* number of distinct LPSs of this length */
  86         int i2;               /* index of second-to-last character in some such sequence */
  87     } *table;
  88 
  89     n = strlen(input);
  90 
  91     if(n == 0) {
  92         /* unique LPS is the empty string */
  93         *count = 1;
  94         out = malloc(1);
  95         out[0] = '\0';
  96         return out;
  97     }
  98 
  99     in = (const unsigned char *) input;
 100 
 101     table = malloc(sizeof(*table) * n);
 102     assert(table != 0);
 103 
 104     for(i = 0; i < n; i++) {
 105         /* we are always allowed to do a singleton */
 106         table[i].length = 1;
 107         table[i].count = 1;
 108         table[i].i2 = n;         /* junk value */
 109 
 110         /* try all possible second-to-last characters */
 111         for(i2 = 0; i2 < i; i2++) {
 112             if(dict[in[i2]][in[i]]) {
 113                 /* possible candidate */
 114                 if(table[i2].length + 1 > table[i].length) {
 115                     /* unique so far */
 116                     table[i].length = table[i2].length + 1;
 117                     table[i].count = table[i2].count;
 118                     table[i].i2 = i2;
 119                 } else if(table[i2].length + 1 == table[i].length) {
 120                     /* add to count */
 121                     table[i].count += table[i2].count;
 122                 }
 123             }
 124         }
 125 
 126         assert(table[i].length >= 1);
 127         assert(table[i].count > 0);
 128     }
 129 
 130     /* now find the best entry */
 131     best = 0;
 132     *count = table[0].count;
 133     for(i = 1; i < n; i++) {
 134         if(table[i].length > table[best].length) {
 135             best = i;
 136             *count = table[i].count;    /* reset count */
 137         } else if(table[i].length == table[best].length) {
 138             *count += table[i].count;
 139         }
 140     }
 141 
 142     /* now we have to construct the output */
 143     best_length = table[best].length;
 144     out = malloc(best_length + 1);       /* extra 1 for nul */
 145 
 146     out[best_length] = '\0';
 147 
 148     scan = best;
 149     for(i = 0; i < best_length; i++) {
 150         assert(scan >= 0);
 151         assert(scan < n);
 152 
 153         out[best_length - i - 1] = input[scan];
 154         scan = table[scan].i2;
 155     }
 156 
 157     free(table);
 158 
 159     return out;
 160 }
 161 
 162 int
 163 main(int argc, char **argv)
 164 {
 165     char *input;
 166     char *output;
 167     Dict dict;
 168     FILE *f;
 169     int count;
 170 
 171     assert(argc > 1);
 172 
 173     f = fopen(argv[1], "r");
 174     assert(f);
 175 
 176     read_dictionary(f, dict);
 177 
 178     fclose(f);
 179 
 180     input = read_all(stdin);
 181 
 182     output = longest_pronounceable_subsequence(input, dict, &count);
 183 
 184     if(count > 1) {
 185         printf("%d\n", count);
 186     } else {
 187         fputs(output, stdout);
 188     }
 189 
 190     free(input);
 191     free(output);
 192 
 193     return 0;
 194 }
lps.c

2014-06-17 11:58