1 /************************************************************************
7 * This is an implemention of Unicode's Bidirectional Algorithm
10 * http://www.unicode.org/reports/tr9/
12 * Author: Ahmad Khalifa
15 * Revision Details: (Updated by Revision Control System)
21 * (www.arabeyes.org - under MIT license)
23 ************************************************************************/
28 * - Explicit marks need to be handled (they are not 100% now)
34 #define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
37 * Flips the text buffer, according to max level, and
41 * from: text buffer, on which to apply flipping
42 * level: resolved levels buffer
43 * max: the maximum level found in this line (should be unsigned char)
44 * count: line size in bidi_char
46 void flipThisRun(bidi_char *from, unsigned char *level, int max, int count)
48 int i, j, rcount, tlevel;
52 while(i<count && j<count)
55 /* find the start of the run of level=max */
57 i = j = findIndexOfRun(level, i, count, max);
58 /* find the end of the run */
59 while(i<count && tlevel <= level[i])
64 for(; rcount>((i-j)/2); rcount--)
66 temp = from[j+rcount-1];
67 from[j+rcount-1] = from[i-rcount];
68 from[i-rcount] = temp;
74 * Finds the index of a run with level equals tlevel
76 int findIndexOfRun(unsigned char* level , int start, int count, int tlevel)
79 for(i=start; i<count; i++)
81 if(tlevel == level[i])
90 * Returns character type of ch, by calling RLE table lookup
93 unsigned char getType(wchar_t ch)
99 * The most significant 2 bits of each level are used to store
100 * Override status of each character
101 * This function sets the override bits of level according
102 * to the value in override, and reurns the new byte.
104 unsigned char setOverrideBits(unsigned char level, unsigned char override)
108 else if(override == R)
110 else if(override == L)
116 * Find the most recent run of the same value in `level', and
117 * return the value _before_ it. Used to process U+202C POP
118 * DIRECTIONAL FORMATTING.
120 int getPreviousLevel(unsigned char* level, int from)
123 unsigned char current = level[--from];
125 while (from >= 0 && level[from] == current)
137 * Returns the first odd value greater than x
139 unsigned char leastGreaterOdd(unsigned char x)
148 * Returns the first even value greater than x
150 unsigned char leastGreaterEven(unsigned char x)
159 * Loops over the RLE_table array looking for the
162 unsigned char getRLE(wchar_t ch)
167 for(i=0; i<lenof(RLE_table); i++)
169 offset += RLE_table[i].f;
171 return RLE_table[i].d;
173 /* anything beyond the end of the table is unknown */
177 /* The Main shaping function, and the only one to be used
178 * by the outside world.
180 * line: buffer to apply shaping to. this must be passed by doBidi() first
181 * to: output buffer for the shaped data
182 * count: number of characters in line
184 int do_shape(bidi_char *line, bidi_char *to, int count)
186 int i, tempShape, ligFlag;
188 for(ligFlag=i=0; i<count; i++)
191 tempShape = STYPE(line[i].wc);
201 tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
202 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
203 to[i].wc = SFINAL((SISOLATED(line[i].wc)));
205 to[i].wc = SISOLATED(line[i].wc);
211 tempShape = (i+1 < count ? STYPE(line[i+1].wc) : SU);
212 if(line[i].wc == 0x644)
214 if (i > 0) switch(line[i-1].wc)
218 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
225 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
232 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
239 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
253 if((tempShape == SL) || (tempShape == SD) || (tempShape == SC))
255 tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
256 if((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
257 to[i].wc = SMEDIAL( (SISOLATED(line[i].wc)) );
259 to[i].wc = SFINAL((SISOLATED(line[i].wc)));
263 tempShape = (i > 0 ? STYPE(line[i-1].wc) : SU);
264 if((tempShape == SR) || (tempShape == SD) || (tempShape == SC))
265 to[i].wc = SINITIAL((SISOLATED(line[i].wc)));
267 to[i].wc = SISOLATED(line[i].wc);
277 * The Main Bidi Function, and the only function that should
278 * be used by the outside world.
280 * line: a buffer of size count containing text to apply
281 * the Bidirectional algorithm to.
284 int do_bidi(bidi_char *line, int count)
286 unsigned char* types;
287 unsigned char* levels;
288 unsigned char paragraphLevel;
289 unsigned char currentEmbedding;
290 unsigned char currentOverride;
291 unsigned char tempType;
292 int i, j, imax, yes, bover;
294 /* Check the presence of R or AL types as optimization */
296 for(i=0; i<count; i++)
298 if(getType(line[i].wc) == R || getType(line[i].wc) == AL)
307 /* Initialize types, levels */
308 types = malloc(sizeof(unsigned char) * count);
309 levels = malloc(sizeof(unsigned char) * count);
311 /* Rule (P1) NOT IMPLEMENTED
312 * P1. Split the text into separate paragraphs. A paragraph separator is
313 * kept with the previous paragraph. Within each paragraph, apply all the
314 * other rules of this algorithm.
318 * P2. In each paragraph, find the first character of type L, AL, or R.
319 * P3. If a character is found in P2 and it is of type AL or R, then set
320 * the paragraph embedding level to one; otherwise, set it to zero.
323 for( i=0; i<count ; i++)
325 if(getType(line[i].wc) == R || getType(line[i].wc) == AL)
330 else if(getType(line[i].wc) == L)
335 * X1. Begin by setting the current embedding level to the paragraph
336 * embedding level. Set the directional override status to neutral.
338 currentEmbedding = paragraphLevel;
339 currentOverride = ON;
341 /* Rule (X2), (X3), (X4), (X5), (X6), (X7), (X8)
342 * X2. With each RLE, compute the least greater odd embedding level.
343 * X3. With each LRE, compute the least greater even embedding level.
344 * X4. With each RLO, compute the least greater odd embedding level.
345 * X5. With each LRO, compute the least greater even embedding level.
346 * X6. For all types besides RLE, LRE, RLO, LRO, and PDF:
347 * a. Set the level of the current character to the current
349 * b. Whenever the directional override status is not neutral,
350 * reset the current character type to the directional
352 * X7. With each PDF, determine the matching embedding or override code.
353 * If there was a valid matching code, restore (pop) the last
354 * remembered (pushed) embedding level and directional override.
355 * X8. All explicit directional embeddings and overrides are completely
356 * terminated at the end of each paragraph. Paragraph separators are not
357 * included in the embedding. (Useless here) NOT IMPLEMENTED
360 for( i=0; i<count; i++)
362 tempType = getType(line[i].wc);
366 currentEmbedding = levels[i] = leastGreaterOdd(currentEmbedding);
367 levels[i] = setOverrideBits(levels[i], currentOverride);
368 currentOverride = ON;
372 currentEmbedding = levels[i] = leastGreaterEven(currentEmbedding);
373 levels[i] = setOverrideBits(levels[i], currentOverride);
374 currentOverride = ON;
378 currentEmbedding = levels[i] = leastGreaterOdd(currentEmbedding);
379 tempType = currentOverride = R;
384 currentEmbedding = levels[i] = leastGreaterEven(currentEmbedding);
385 tempType = currentOverride = L;
391 int prevlevel = getPreviousLevel(levels, i);
393 if (prevlevel == -1) {
394 currentEmbedding = paragraphLevel;
395 currentOverride = ON;
397 currentOverride = currentEmbedding & OMASK;
398 currentEmbedding = currentEmbedding & ~OMASK;
401 levels[i] = currentEmbedding;
404 /* Whitespace is treated as neutral for now */
407 levels[i] = currentEmbedding;
409 if(currentOverride != ON)
410 tempType = currentOverride;
414 levels[i] = currentEmbedding;
415 if(currentOverride != ON)
416 tempType = currentOverride;
422 /* this clears out all overrides, so we can use levels safely... */
423 /* checks bover first */
425 for( i=0; i<count; i++)
426 levels[i] = levels[i] & LMASK;
429 * X9. Remove all RLE, LRE, RLO, LRO, PDF, and BN codes.
430 * Here, they're converted to BN.
432 for(i=0; i<count; i++)
447 * W1. Examine each non-spacing mark (NSM) in the level run, and change
448 * the type of the NSM to the type of the previous character. If the NSM
449 * is at the start of the level run, it will get the type of sor.
452 types[0] = paragraphLevel;
454 for(i=1; i<count; i++)
457 types[i] = types[i-1];
458 /* Is this a safe assumption?
459 * I assumed the previous, IS a character.
464 * W2. Search backwards from each instance of a European number until the
465 * first strong type (R, L, AL, or sor) is found. If an AL is found,
466 * change the type of the European number to Arabic number.
468 for(i=0; i<count; i++)
479 }else if(types[j] == R || types[j] == L)
489 * W3. Change all ALs to R.
491 * Optimization: on Rule Xn, we might set a flag on AL type
492 * to prevent this loop in L R lines only...
494 for(i=0; i<count; i++)
501 * W4. A single European separator between two European numbers changes
502 * to a European number. A single common separator between two numbers
503 * of the same type changes to that type.
505 for( i=1; i<(count-1); i++)
509 if(types[i-1] == EN && types[i+1] == EN)
511 }else if(types[i] == CS)
513 if(types[i-1] == EN && types[i+1] == EN)
515 else if(types[i-1] == AN && types[i+1] == AN)
521 * W5. A sequence of European terminators adjacent to European numbers
522 * changes to all European numbers.
524 * Optimization: lots here... else ifs need rearrangement
526 for(i=0; i<count; i++)
530 if(i > 0 && types[i-1] == EN)
534 }else if(i < count-1 && types[i+1] == EN)
538 }else if(i < count-1 && types[i+1] == ET)
541 while(j <count && types[j] == ET)
552 * W6. Otherwise, separators and terminators change to Other Neutral:
554 for(i=0; i<count; i++)
567 * W7. Search backwards from each instance of a European number until
568 * the first strong type (R, L, or sor) is found. If an L is found,
569 * then change the type of the European number to L.
571 for(i=0; i<count; i++)
583 else if(types[j] == R || types[j] == AL)
593 * N1. A sequence of neutrals takes the direction of the surrounding
594 * strong text if the text on both sides has the same direction. European
595 * and Arabic numbers are treated as though they were R.
597 if(count >= 2 && types[0] == ON)
599 if((types[1] == R) || (types[1] == EN) || (types[1] == AN))
601 else if(types[1] == L)
604 for(i=1; i<(count-1); i++)
611 while(j<(count-1) && types[j] == ON)
624 }else if((types[i-1] == R) ||
625 (types[i-1] == EN) ||
629 while(j<(count-1) && types[j] == ON)
633 if((types[j] == R) ||
646 if(count >= 2 && types[count-1] == ON)
648 if(types[count-2] == R || types[count-2] == EN || types[count-2] == AN)
650 else if(types[count-2] == L)
655 * N2. Any remaining neutrals take the embedding direction.
657 for(i=0; i<count; i++)
661 if((levels[i] % 2) == 0)
669 * I1. For all characters with an even (left-to-right) embedding
670 * direction, those of type R go up one level and those of type AN or
671 * EN go up two levels.
673 for(i=0; i<count; i++)
675 if((levels[i] % 2) == 0)
679 else if(types[i] == AN || types[i] == EN)
685 * I2. For all characters with an odd (right-to-left) embedding direction,
686 * those of type L, EN or AN go up one level.
688 for(i=0; i<count; i++)
690 if((levels[i] % 2) == 1)
692 if(types[i] == L || types[i] == EN || types[i] == AN)
698 * L1. On each line, reset the embedding level of the following characters
699 * to the paragraph embedding level:
700 * (1)segment separators, (2)paragraph separators,
701 * (3)any sequence of whitespace characters preceding
702 * a segment separator or paragraph separator,
703 * (4)and any sequence of white space characters
704 * at the end of the line.
705 * The types of characters used here are the original types, not those
706 * modified by the previous phase.
709 while(j>0 && (getType(line[j].wc) == WS))
715 for(j++; j<count; j++)
716 levels[j] = paragraphLevel;
718 for(i=0; i<count; i++)
720 tempType = getType(line[i].wc);
724 while(j<count && (getType(line[j].wc) == WS))
728 if(j==count || getType(line[j].wc) == B ||
729 getType(line[j].wc) == S)
733 levels[j] = paragraphLevel;
736 }else if(tempType == B || tempType == S)
737 levels[i] = paragraphLevel;
740 /* Rule (L4) NOT IMPLEMENTED
741 * L4. A character that possesses the mirrored property as specified by
742 * Section 4.7, Mirrored, must be depicted by a mirrored glyph if the
743 * resolved directionality of that character is R.
745 /* Note: this is implemented before L2 for efficiency */
746 for(i=0; i<count; i++)
747 if((levels[i] % 2) == 1)
748 doMirror(&line[i].wc);
751 * L2. From the highest level found in the text to the lowest odd level on
752 * each line, including intermediate levels not actually present in the
753 * text, reverse any contiguous sequence of characters that are at that
756 /* we flip the character string and leave the level array */
759 tempType = levels[0];
762 if(levels[i] > tempType)
764 tempType = levels[i];
769 /* maximum level in tempType, its index in imax. */
770 while(tempType > 0) /* loop from highest level to the least odd, */
771 { /* which i assume is 1 */
772 flipThisRun(line, levels, tempType, count);
776 /* Rule (L3) NOT IMPLEMENTED
777 * L3. Combining marks applied to a right-to-left base character will at
778 * this point precede their base character. If the rendering engine
779 * expects them to follow the base characters in the final display
780 * process, then the ordering of the marks and the base character must
790 * Bad, Horrible funtion
791 * takes a pointer to a character that is checked for
792 * having a mirror glyph.
794 void doMirror(wchar_t* ch)
796 if((*ch & 0xFF00) == 0)
832 else if((*ch & 0xFF00) == 0x2000)
862 else if((*ch & 0xFF00) == 0x2200)
1209 }else if((*ch & 0xFF00) == 0x2300)
1233 else if((*ch & 0xFF00) == 0x2700)
1323 else if((*ch & 0xFF00) == 0x2900)
1455 else if((*ch & 0xFF00) == 0x2A00)
1701 else if((*ch & 0xFF00) == 0x3000)
1761 else if((*ch & 0xFF00) == 0xFF00)