1 /************************************************************************
2 * $Id: minibidi.c,v 1.1 2004/05/22 10:36:50 simon Exp $
7 * This is an implemention of Unicode's Bidirectional Algorithm
10 * http://www.unicode.org/reports/tr9/
12 * Author: Ahmad Khalifa
15 * Revision Details: (Updated by Revision Control System)
17 * $Date: 2004/05/22 10:36:50 $
20 * $Source: /u1/simon/svn-migration/cvs/putty/minibidi.c,v $
22 * (www.arabeyes.org - under MIT license)
24 ************************************************************************/
29 * - Explicit marks need to be handled (they are not 100% now)
36 * Flips the text buffer, according to max level, and
40 * from: text buffer, on which to apply flipping
41 * level: resolved levels buffer
42 * max: the maximum level found in this line (should be unsigned char)
43 * count: line size in bidi_char
45 void flipThisRun(bidi_char *from, unsigned char *level, int max, int count)
47 int i, j, rcount, tlevel;
51 while(i<count && j<count)
54 /* find the start of the run of level=max */
56 i = j = findIndexOfRun(level, i, count, max);
57 /* find the end of the run */
58 while(tlevel <= level[i] && i<count)
63 for(; rcount>((i-j)/2); rcount--)
65 temp = from[j+rcount-1];
66 from[j+rcount-1] = from[i-rcount];
67 from[i-rcount] = temp;
73 * Finds the index of a run with level equals tlevel
75 int findIndexOfRun(unsigned char* level , int start, int count, int tlevel)
78 for(i=start; i<count; i++)
80 if(tlevel == level[i])
89 * Returns character type of ch, by calling RLE table lookup
92 unsigned char getType(wchar_t ch)
98 * The most significant 2 bits of each level are used to store
99 * Override status of each character
100 * This function sets the override bits of level according
101 * to the value in override, and reurns the new byte.
103 unsigned char setOverrideBits(unsigned char level, unsigned char override)
107 else if(override == R)
109 else if(override == L)
114 /* Dont remember what this was used for :-) */
115 unsigned char getPreviousLevel(unsigned char* level, int from)
117 unsigned char current;
119 current = level[from];
120 while(from>0 && level[from] == current)
124 return level[++from];
128 * Returns the first odd value greater than x
130 unsigned char leastGreaterOdd(unsigned char x)
139 * Returns the first even value greater than x
141 unsigned char leastGreaterEven(unsigned char x)
150 * Loops over the RLE_table array looking for the
153 unsigned char getRLE(wchar_t ch)
158 for(i=0; i<0xFFFF; i++)
160 freq = ((RLENode*)RLE_table)[i].f;
163 return ((RLENode*)RLE_table)[i].d;
165 return ((RLENode*)RLE_table)[i-1].d;
167 /* this is here to stop compiler nagging */
171 /* The Main shaping function, and the only one to be used
172 * by the outside world.
174 * line: buffer to apply shaping to. this must be passed by doBidi() first
175 * to: output buffer for the shaped data
176 * count: number of characters in line
178 int do_shape(bidi_char *line, bidi_char *to, int count)
180 int i, tempShape, ligFlag;
182 for(ligFlag=i=0; i<count; i++)
185 tempShape = STYPE(line[i].wc);
195 tempShape = STYPE(line[i+1].wc);
196 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
197 to[i].wc = SFINAL((SISOLATED(line[i].wc)));
199 to[i].wc = SISOLATED(line[i].wc);
205 tempShape = STYPE(line[i+1].wc);
206 if(line[i].wc == 0x644)
212 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
219 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
226 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
233 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
247 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
249 tempShape = STYPE(line[i-1].wc);
250 if((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
251 to[i].wc = SMEDIAL( (SISOLATED(line[i].wc)) );
253 to[i].wc = SFINAL((SISOLATED(line[i].wc)));
257 tempShape = STYPE(line[i-1].wc);
258 if((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
259 to[i].wc = SINITIAL((SISOLATED(line[i].wc)));
261 to[i].wc = SISOLATED(line[i].wc);
271 * The Main Bidi Function, and the only function that should
272 * be used by the outside world.
274 * line: a buffer of size count containing text to apply
275 * the Bidirectional algorithm to.
278 int do_bidi(bidi_char *line, int count)
280 unsigned char* types;
281 unsigned char* levels;
282 unsigned char paragraphLevel;
283 unsigned char currentEmbedding;
284 unsigned char currentOverride;
285 unsigned char tempType;
286 int i, j, imax, yes, bover;
288 /* Check the presence of R or AL types as optimization */
290 for(i=0; i<count; i++)
292 if(getType(line[i].wc) == R || getType(line[i].wc) == AL)
301 /* Initialize types, levels */
302 types = malloc(sizeof(unsigned char) * count);
303 levels = malloc(sizeof(unsigned char) * count);
305 /* Rule (P1) NOT IMPLEMENTED
306 * P1. Split the text into separate paragraphs. A paragraph separator is
307 * kept with the previous paragraph. Within each paragraph, apply all the
308 * other rules of this algorithm.
312 * P2. In each paragraph, find the first character of type L, AL, or R.
313 * P3. If a character is found in P2 and it is of type AL or R, then set
314 * the paragraph embedding level to one; otherwise, set it to zero.
317 for( i=0; i<count ; i++)
319 if(getType(line[i].wc) == R || getType(line[i].wc) == AL)
324 else if(getType(line[i].wc) == L)
329 * X1. Begin by setting the current embedding level to the paragraph
330 * embedding level. Set the directional override status to neutral.
332 currentEmbedding = paragraphLevel;
333 currentOverride = ON;
335 /* Rule (X2), (X3), (X4), (X5), (X6), (X7), (X8)
336 * X2. With each RLE, compute the least greater odd embedding level.
337 * X3. With each LRE, compute the least greater even embedding level.
338 * X4. With each RLO, compute the least greater odd embedding level.
339 * X5. With each LRO, compute the least greater even embedding level.
340 * X6. For all types besides RLE, LRE, RLO, LRO, and PDF:
341 * a. Set the level of the current character to the current
343 * b. Whenever the directional override status is not neutral,
344 * reset the current character type to the directional
346 * X7. With each PDF, determine the matching embedding or override code.
347 * If there was a valid matching code, restore (pop) the last
348 * remembered (pushed) embedding level and directional override.
349 * X8. All explicit directional embeddings and overrides are completely
350 * terminated at the end of each paragraph. Paragraph separators are not
351 * included in the embedding. (Useless here) NOT IMPLEMENTED
354 for( i=0; i<count; i++)
356 tempType = getType(line[i].wc);
360 currentEmbedding = levels[i] = leastGreaterOdd(currentEmbedding);
361 levels[i] = setOverrideBits(levels[i], currentOverride);
362 currentOverride = ON;
366 currentEmbedding = levels[i] = leastGreaterEven(currentEmbedding);
367 levels[i] = setOverrideBits(levels[i], currentOverride);
368 currentOverride = ON;
372 currentEmbedding = levels[i] = leastGreaterOdd(currentEmbedding);
373 tempType = currentOverride = R;
378 currentEmbedding = levels[i] = leastGreaterEven(currentEmbedding);
379 tempType = currentOverride = L;
384 currentEmbedding = getPreviousLevel(levels, i);
385 currentOverride = currentEmbedding & OMASK;
386 currentEmbedding = currentEmbedding & ~OMASK;
387 levels[i] = currentEmbedding;
390 /* Whitespace is treated as neutral for now */
393 levels[i] = currentEmbedding;
395 if(currentOverride != ON)
396 tempType = currentOverride;
400 levels[i] = currentEmbedding;
401 if(currentOverride != ON)
402 tempType = currentOverride;
408 /* this clears out all overrides, so we can use levels safely... */
409 /* checks bover first */
411 for( i=0; i<count; i++)
412 levels[i] = levels[i] & LMASK;
415 * X9. Remove all RLE, LRE, RLO, LRO, PDF, and BN codes.
416 * Here, they're converted to BN.
418 for(i=0; i<count; i++)
433 * W1. Examine each non-spacing mark (NSM) in the level run, and change
434 * the type of the NSM to the type of the previous character. If the NSM
435 * is at the start of the level run, it will get the type of sor.
438 types[0] = paragraphLevel;
440 for(i=1; i<count; i++)
443 types[i] = types[i-1];
444 /* Is this a safe assumption?
445 * I assumed the previous, IS a character.
450 * W2. Search backwards from each instance of a European number until the
451 * first strong type (R, L, AL, or sor) is found. If an AL is found,
452 * change the type of the European number to Arabic number.
454 for(i=0; i<count; i++)
465 }else if(types[j] == R || types[j] == L)
475 * W3. Change all ALs to R.
477 * Optimization: on Rule Xn, we might set a flag on AL type
478 * to prevent this loop in L R lines only...
480 for(i=0; i<count; i++)
487 * W4. A single European separator between two European numbers changes
488 * to a European number. A single common separator between two numbers
489 * of the same type changes to that type.
491 for( i=0; i<(count-1); i++)
495 if(types[i-1] == EN && types[i+1] == EN)
497 }else if(types[i] == CS)
499 if(types[i-1] == EN && types[i+1] == EN)
501 else if(types[i-1] == AN && types[i+1] == AN)
507 * W5. A sequence of European terminators adjacent to European numbers
508 * changes to all European numbers.
510 * Optimization: lots here... else ifs need rearrangement
512 for(i=0; i<count; i++)
520 }else if(types[i+1] == EN)
524 }else if(types[i+1] == ET)
527 while(j <count && types[j] == ET)
538 * W6. Otherwise, separators and terminators change to Other Neutral:
540 for(i=0; i<count; i++)
553 * W7. Search backwards from each instance of a European number until
554 * the first strong type (R, L, or sor) is found. If an L is found,
555 * then change the type of the European number to L.
557 for(i=0; i<count; i++)
569 else if(types[j] == R || types[j] == AL)
579 * N1. A sequence of neutrals takes the direction of the surrounding
580 * strong text if the text on both sides has the same direction. European
581 * and Arabic numbers are treated as though they were R.
585 if((types[1] == R) || (types[1] == EN) || (types[1] == AN))
587 else if(types[1] == L)
590 for(i=1; i<(count-1); i++)
597 while(j<(count-1) && types[j] == ON)
610 }else if((types[i-1] == R) ||
611 (types[i-1] == EN) ||
615 while(j<(count-1) && types[j] == ON)
619 if((types[j] == R) ||
632 if(types[count-1] == ON)
634 if(types[count-2] == R || types[count-2] == EN || types[count-2] == AN)
636 else if(types[count-2] == L)
641 * N2. Any remaining neutrals take the embedding direction.
643 for(i=0; i<count; i++)
647 if((levels[i] % 2) == 0)
655 * I1. For all characters with an even (left-to-right) embedding
656 * direction, those of type R go up one level and those of type AN or
657 * EN go up two levels.
659 for(i=0; i<count; i++)
661 if((levels[i] % 2) == 0)
665 else if(types[i] == AN || types[i] == EN)
671 * I2. For all characters with an odd (right-to-left) embedding direction,
672 * those of type L, EN or AN go up one level.
674 for(i=0; i<count; i++)
676 if((levels[i] % 2) == 1)
678 if(types[i] == L || types[i] == EN || types[i] == AN)
684 * L1. On each line, reset the embedding level of the following characters
685 * to the paragraph embedding level:
686 * (1)segment separators, (2)paragraph separators,
687 * (3)any sequence of whitespace characters preceding
688 * a segment separator or paragraph separator,
689 * (4)and any sequence of white space characters
690 * at the end of the line.
691 * The types of characters used here are the original types, not those
692 * modified by the previous phase.
695 while(j>0 && (getType(line[j].wc) == WS))
701 for(j++; j<count; j++)
702 levels[j] = paragraphLevel;
704 for(i=0; i<count; i++)
706 tempType = getType(line[i].wc);
710 while(j<count && (getType(line[j].wc) == WS))
714 if(getType(line[j].wc) == B || getType(line[j].wc) == S)
718 levels[j] = paragraphLevel;
721 }else if(tempType == B || tempType == S)
722 levels[i] = paragraphLevel;
725 /* Rule (L4) NOT IMPLEMENTED
726 * L4. A character that possesses the mirrored property as specified by
727 * Section 4.7, Mirrored, must be depicted by a mirrored glyph if the
728 * resolved directionality of that character is R.
730 /* Note: this is implemented before L2 for efficiency */
731 for(i=0; i<count; i++)
732 if((levels[i] % 2) == 1)
733 doMirror(&line[i].wc);
736 * L2. From the highest level found in the text to the lowest odd level on
737 * each line, including intermediate levels not actually present in the
738 * text, reverse any contiguous sequence of characters that are at that
741 /* we flip the character string and leave the level array */
744 tempType = levels[0];
747 if(levels[i] > tempType)
749 tempType = levels[i];
754 /* maximum level in tempType, its index in imax. */
755 while(tempType > 0) /* loop from highest level to the least odd, */
756 { /* which i assume is 1 */
757 flipThisRun(line, levels, tempType, count);
761 /* Rule (L3) NOT IMPLEMENTED
762 * L3. Combining marks applied to a right-to-left base character will at
763 * this point precede their base character. If the rendering engine
764 * expects them to follow the base characters in the final display
765 * process, then the ordering of the marks and the base character must
775 * Bad, Horrible funtion
776 * takes a pointer to a character that is checked for
777 * having a mirror glyph.
779 void doMirror(wchar_t* ch)
781 if((*ch & 0xFF00) == 0)
817 else if((*ch & 0xFF00) == 0x2000)
847 else if((*ch & 0xFF00) == 0x2200)
1194 }else if((*ch & 0xFF00) == 0x2300)
1218 else if((*ch & 0xFF00) == 0x2700)
1308 else if((*ch & 0xFF00) == 0x2900)
1440 else if((*ch & 0xFF00) == 0x2A00)
1686 else if((*ch & 0xFF00) == 0x3000)
1746 else if((*ch & 0xFF00) == 0xFF00)