From 1cf93847c1edf3128f3a2fd87db6a490470ce4ec Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Fri, 8 Oct 2021 19:09:53 +0000
Subject: [PATCH 1/7] t4034/cpp: actually test that operator tokens are not
 split

8d96e7288f2b (t4034: bulk verify builtin word regex sanity, 2010-12-18)
added many tests with the intent to verify that operators consisting of
more than one symbol are kept together. These are tested by probing a
transition from, e.g., a!=b to x!=y, which results in the word-diff

  [-a-]{+x+}!=[-b-]{+y+}

But that proves only that the letters and operators are separate tokens.
To prove that != is an inseparable token, we have to probe a transition
from, e.g., a=b to a!=b having a word-diff

  a[-=-]{+!=+}b

that proves that the ! is not separate from the =.

In the post-image, add a character to or remove a character from each
operator such that it turns into another valid operator.

Change the identifiers used around operators such that the diff
algorithm does not have an incentive to match, e.g., a<b in one spot
in the pre-image with a<b elsewhere in the post-image.

Adjust the expected output to match the new differences. Notice that
there are some undesirable tokenizations around e, ., and -.  This will
be addressed in a later change.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 45 +++++++++++++++------------------------------
 t/t4034/cpp/post   | 29 +++++++++++++----------------
 t/t4034/cpp/pre    | 25 +++++++++++--------------
 3 files changed, 39 insertions(+), 60 deletions(-)
diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index 37d1ea2587..41976971b9 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -1,36 +1,21 @@
 <BOLD>diff --git a/pre b/post<RESET>
-<BOLD>index 23d5c8a..7e8c026 100644<RESET>
+<BOLD>index c5672a2..4229868 100644<RESET>
 <BOLD>--- a/pre<RESET>
 <BOLD>+++ b/post<RESET>
-<CYAN>@@ -1,19 +1,19 @@<RESET>
+<CYAN>@@ -1,16 +1,16 @@<RESET>
 Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <GREEN>bar(x);<RESET> }
 cout<<"Hello World<RED>!<RESET><GREEN>?<RESET>\n"<<endl;
 <GREEN>(<RESET>1<GREEN>) (<RESET>-1e10<GREEN>) (<RESET>0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>y<RESET>'
-[<RED>a<RESET><GREEN>x<RESET>] <RED>a<RESET><GREEN>x<RESET>-><RED>b a<RESET><GREEN>y x<RESET>.<RED>b<RESET><GREEN>y<RESET>
-!<RED>a<RESET><GREEN>x<RESET> ~<RED>a a<RESET><GREEN>x x<RESET>++ <RED>a<RESET><GREEN>x<RESET>-- <RED>a<RESET><GREEN>x<RESET>*<RED>b a<RESET><GREEN>y x<RESET>&<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>*<RED>b a<RESET><GREEN>y x<RESET>/<RED>b a<RESET><GREEN>y x<RESET>%<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>+<RED>b a<RESET><GREEN>y x<RESET>-<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET><<<RED>b a<RESET><GREEN>y x<RESET>>><RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET><<RED>b a<RESET><GREEN>y x<RESET><=<RED>b a<RESET><GREEN>y x<RESET>><RED>b a<RESET><GREEN>y x<RESET>>=<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>==<RED>b a<RESET><GREEN>y x<RESET>!=<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>&<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>^<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>|<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>&&<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>||<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>?<RED>b<RESET><GREEN>y<RESET>:z
-<RED>a<RESET><GREEN>x<RESET>=<RED>b a<RESET><GREEN>y x<RESET>+=<RED>b a<RESET><GREEN>y x<RESET>-=<RED>b a<RESET><GREEN>y x<RESET>*=<RED>b a<RESET><GREEN>y x<RESET>/=<RED>b a<RESET><GREEN>y x<RESET>%=<RED>b a<RESET><GREEN>y x<RESET><<=<RED>b a<RESET><GREEN>y x<RESET>>>=<RED>b a<RESET><GREEN>y x<RESET>&=<RED>b a<RESET><GREEN>y x<RESET>^=<RED>b a<RESET><GREEN>y x<RESET>|=<RED>b<RESET>
-<RED>a<RESET><GREEN>y<RESET>
-<GREEN>x<RESET>,y
-<RED>a<RESET><GREEN>x<RESET>::<RED>b<RESET><GREEN>y<RESET>
+[a] b<RED>-><RESET><GREEN>->*<RESET>v d<RED>.e<RESET><GREEN>.*e<RESET>
+<GREEN>~<RESET>!a <GREEN>!<RESET>~b c<RED>++<RESET><GREEN>+<RESET> d<RED>--<RESET><GREEN>-<RESET> e*<GREEN>*<RESET>f g<RED>&<RESET><GREEN>&&<RESET>h
+a<RED>*<RESET><GREEN>*=<RESET>b c<RED>/<RESET><GREEN>/=<RESET>d e<RED>%<RESET><GREEN>%=<RESET>f
+a<RED>+<RESET><GREEN>++<RESET>b c<RED>-<RESET><GREEN>--<RESET>d
+a<RED><<<RESET><GREEN><<=<RESET>b c<RED>>><RESET><GREEN>>>=<RESET>d
+a<RED><<RESET><GREEN><=<RESET>b c<RED><=<RESET><GREEN><<RESET>d e<RED>><RESET><GREEN>>=<RESET>f g<RED>>=<RESET><GREEN>><RESET>h
+a<RED>==<RESET><GREEN>!=<RESET>b c<RED>!=<RESET><GREEN>=<RESET>d
+a<RED>^<RESET><GREEN>^=<RESET>b c<RED>|<RESET><GREEN>|=<RESET>d e<RED>&&<RESET><GREEN>&=<RESET>f
+a<RED>||<RESET><GREEN>|<RESET>b
+a?<GREEN>:<RESET>b
+a<RED>=<RESET><GREEN>==<RESET>b c<RED>+=<RESET><GREEN>+<RESET>d <RED>e-=f<RESET><GREEN>e-f<RESET> g<RED>*=<RESET><GREEN>*<RESET>h i<RED>/=<RESET><GREEN>/<RESET>j k<RED>%=<RESET><GREEN>%<RESET>l m<RED><<=<RESET><GREEN><<<RESET>n o<RED>>>=<RESET><GREEN>>><RESET>p q<RED>&=<RESET><GREEN>&<RESET>r s<RED>^=<RESET><GREEN>^<RESET>t u<RED>|=<RESET><GREEN>|<RESET>v
+a,b<RESET>
+a<RED>::<RESET><GREEN>:<RESET>b
diff --git a/t/t4034/cpp/post b/t/t4034/cpp/post
index 7e8c026cef..4229868ae6 100644
--- a/t/t4034/cpp/post
+++ b/t/t4034/cpp/post
@@ -1,19 +1,16 @@
 Foo() : x(0&42) { bar(x); }
 cout<<"Hello World?\n"<<endl;
 (1) (-1e10) (0xabcdef) 'y'
-[x] x->y x.y
-!x ~x x++ x-- x*y x&y
-x*y x/y x%y
-x+y x-y
-x<<y x>>y
-x<y x<=y x>y x>=y
-x==y x!=y
-x&y
-x^y
-x|y
-x&&y
-x||y
-x?y:z
-x=y x+=y x-=y x*=y x/=y x%=y x<<=y x>>=y x&=y x^=y x|=y
-x,y
-x::y
+[a] b->*v d.*e
+~!a !~b c+ d- e**f g&&h
+a*=b c/=d e%=f
+a++b c--d
+a<<=b c>>=d
+a<=b c<d e>=f g>h
+a!=b c=d
+a^=b c|=d e&=f
+a|b
+a?:b
+a==b c+d e-f g*h i/j k%l m<<n o>>p q&r s^t u|v
+a,b
+a:b
diff --git a/t/t4034/cpp/pre b/t/t4034/cpp/pre
index 23d5c8adf5..c5672a24cf 100644
--- a/t/t4034/cpp/pre
+++ b/t/t4034/cpp/pre
@@ -1,19 +1,16 @@
 Foo():x(0&&1){}
 cout<<"Hello World!\n"<<endl;
 1 -1e10 0xabcdef 'x'
-[a] a->b a.b
-!a ~a a++ a-- a*b a&b
-a*b a/b a%b
-a+b a-b
-a<<b a>>b
-a<b a<=b a>b a>=b
-a==b a!=b
-a&b
-a^b
-a|b
-a&&b
+[a] b->v d.e
+!a ~b c++ d-- e*f g&h
+a*b c/d e%f
+a+b c-d
+a<<b c>>d
+a<b c<=d e>f g>=h
+a==b c!=d
+a^b c|d e&&f
 a||b
-a?b:z
-a=b a+=b a-=b a*=b a/=b a%=b a<<=b a>>=b a&=b a^=b a|=b
-a,y
+a?b
+a=b c+=d e-=f g*=h i/=j k%=l m<<=n o>>=p q&=r s^=t u|=v
+a,b
 a::b

From 3e063de46e6270606e058b96bfcc0baebc4aea81 Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Fri, 8 Oct 2021 19:09:54 +0000
Subject: [PATCH 2/7] t4034: add tests showing problematic cpp tokenizations

The word regex is too loose and matches long streaks of characters
that should actually be separate tokens.  Add these problematic test
cases. Separate the lines with text that will remain identical in the
pre- and post-image so that the diff algorithm will not lump removals
and additions of consecutive lines together. This makes the expected
output easier to read.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 22 ++++++++++++++++++----
 t/t4034/cpp/post   | 18 ++++++++++++++++--
 t/t4034/cpp/pre    | 16 +++++++++++++++-
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index 41976971b9..63e53a61e6 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -1,11 +1,25 @@
 <BOLD>diff --git a/pre b/post<RESET>
-<BOLD>index c5672a2..4229868 100644<RESET>
+<BOLD>index 1229cdb..3feae6f 100644<RESET>
 <BOLD>--- a/pre<RESET>
 <BOLD>+++ b/post<RESET>
-<CYAN>@@ -1,16 +1,16 @@<RESET>
-Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <GREEN>bar(x);<RESET> }
+<CYAN>@@ -1,30 +1,30 @@<RESET>
+Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <RED>foo0<RESET><GREEN>bar<RESET>(x<RED>.f<RESET><GREEN>.F<RESET>ind); }
 cout<<"Hello World<RED>!<RESET><GREEN>?<RESET>\n"<<endl;
-<GREEN>(<RESET>1<GREEN>) (<RESET>-1e10<GREEN>) (<RESET>0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>y<RESET>'
+<GREEN>(<RESET>1 <RED>-1e10<RESET><GREEN>+1e10<RESET> 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>y<RESET>'
+// long double<RESET>
+<RED>3.141592653e-10l<RESET><GREEN>3.141592654e+10l<RESET>
+// float<RESET>
+120<RED>E5f<RESET><GREEN>E6f<RESET>
+// hex<RESET>
+<RED>0xdeadbeaf+8<RESET><GREEN>0xdeadBeaf+7<RESET>ULL
+// octal<RESET>
+<RED>01234567<RESET><GREEN>01234560<RESET>
+// binary<RESET>
+<RED>0b1000<RESET><GREEN>0b1100<RESET>+e1
+// expression<RESET>
+<RED>1.5-e+2+f<RESET><GREEN>1.5-e+3+f<RESET>
+// another one<RESET>
+str<RED>.e+65<RESET><GREEN>.e+75<RESET>
 [a] b<RED>-><RESET><GREEN>->*<RESET>v d<RED>.e<RESET><GREEN>.*e<RESET>
 <GREEN>~<RESET>!a <GREEN>!<RESET>~b c<RED>++<RESET><GREEN>+<RESET> d<RED>--<RESET><GREEN>-<RESET> e*<GREEN>*<RESET>f g<RED>&<RESET><GREEN>&&<RESET>h
 a<RED>*<RESET><GREEN>*=<RESET>b c<RED>/<RESET><GREEN>/=<RESET>d e<RED>%<RESET><GREEN>%=<RESET>f
diff --git a/t/t4034/cpp/post b/t/t4034/cpp/post
index 4229868ae6..3feae6f430 100644
--- a/t/t4034/cpp/post
+++ b/t/t4034/cpp/post
@@ -1,6 +1,20 @@
-Foo() : x(0&42) { bar(x); }
+Foo() : x(0&42) { bar(x.Find); }
 cout<<"Hello World?\n"<<endl;
-(1) (-1e10) (0xabcdef) 'y'
+(1 +1e10 0xabcdef) 'y'
+// long double
+3.141592654e+10l
+// float
+120E6f
+// hex
+0xdeadBeaf+7ULL
+// octal
+01234560
+// binary
+0b1100+e1
+// expression
+1.5-e+3+f
+// another one
+str.e+75
 [a] b->*v d.*e
 ~!a !~b c+ d- e**f g&&h
 a*=b c/=d e%=f
diff --git a/t/t4034/cpp/pre b/t/t4034/cpp/pre
index c5672a24cf..1229cdb59d 100644
--- a/t/t4034/cpp/pre
+++ b/t/t4034/cpp/pre
@@ -1,6 +1,20 @@
-Foo():x(0&&1){}
+Foo():x(0&&1){ foo0( x.find); }
 cout<<"Hello World!\n"<<endl;
 1 -1e10 0xabcdef 'x'
+// long double
+3.141592653e-10l
+// float
+120E5f
+// hex
+0xdeadbeaf+8ULL
+// octal
+01234567
+// binary
+0b1000+e1
+// expression
+1.5-e+2+f
+// another one
+str.e+65
 [a] b->v d.e
 !a ~b c++ d-- e*f g&h
 a*b c/d e%f

From 350b87cd658553598a269fdd320ca05ee4789a10 Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Fri, 8 Oct 2021 19:09:55 +0000
Subject: [PATCH 3/7] userdiff-cpp: tighten word regex

Generally, word regexes can be written such that they match tokens
liberally and need not model the actual syntax because it can be assumed
that the regex will only be applied to syntactically correct text.

The regex for cpp (C/C++) is too liberal, though. It regards these
sequences as single tokens:

   1+2
   1.5-e+2+f

and the following amalgams as one token:

   .l      as in str.length
   .f      as in str.find
   .e      as in str.erase

Tighten the regex in the following way:

- Accept + and - only in one position in the exponent. + and - are no
  longer regarded as the sign of a number and are treated by the
  catch-all that is not visible in the driver's regex.

- Accept a leading decimal point only when it is followed by a digit.

For readability, factor hex and binary numbers into a term of their own.

As a drive-by, this fixes the problem that floating point numbers such
as 12E5 (with upper-case E) were split into two tokens.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 16 ++++++++--------
 userdiff.c         |  8 +++++++-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index 63e53a61e6..46c9460a96 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -3,24 +3,24 @@
 <BOLD>--- a/pre<RESET>
 <BOLD>+++ b/post<RESET>
 <CYAN>@@ -1,30 +1,30 @@<RESET>
-Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <RED>foo0<RESET><GREEN>bar<RESET>(x<RED>.f<RESET><GREEN>.F<RESET>ind); }
+Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <RED>foo0<RESET><GREEN>bar<RESET>(x.<RED>find<RESET><GREEN>Find<RESET>); }
 cout<<"Hello World<RED>!<RESET><GREEN>?<RESET>\n"<<endl;
-<GREEN>(<RESET>1 <RED>-1e10<RESET><GREEN>+1e10<RESET> 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>y<RESET>'
+<GREEN>(<RESET>1 <RED>-<RESET><GREEN>+<RESET>1e10 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>y<RESET>'
 // long double<RESET>
 <RED>3.141592653e-10l<RESET><GREEN>3.141592654e+10l<RESET>
 // float<RESET>
-120<RED>E5f<RESET><GREEN>E6f<RESET>
+<RED>120E5f<RESET><GREEN>120E6f<RESET>
 // hex<RESET>
-<RED>0xdeadbeaf+8<RESET><GREEN>0xdeadBeaf+7<RESET>ULL
+<RED>0xdeadbeaf<RESET><GREEN>0xdeadBeaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
 // octal<RESET>
 <RED>01234567<RESET><GREEN>01234560<RESET>
 // binary<RESET>
 <RED>0b1000<RESET><GREEN>0b1100<RESET>+e1
 // expression<RESET>
-<RED>1.5-e+2+f<RESET><GREEN>1.5-e+3+f<RESET>
+1.5-e+<RED>2<RESET><GREEN>3<RESET>+f
 // another one<RESET>
-str<RED>.e+65<RESET><GREEN>.e+75<RESET>
-[a] b<RED>-><RESET><GREEN>->*<RESET>v d<RED>.e<RESET><GREEN>.*e<RESET>
+str.e+<RED>65<RESET><GREEN>75<RESET>
+[a] b<RED>-><RESET><GREEN>->*<RESET>v d<RED>.<RESET><GREEN>.*<RESET>e
 <GREEN>~<RESET>!a <GREEN>!<RESET>~b c<RED>++<RESET><GREEN>+<RESET> d<RED>--<RESET><GREEN>-<RESET> e*<GREEN>*<RESET>f g<RED>&<RESET><GREEN>&&<RESET>h
 a<RED>*<RESET><GREEN>*=<RESET>b c<RED>/<RESET><GREEN>/=<RESET>d e<RED>%<RESET><GREEN>%=<RESET>f
 a<RED>+<RESET><GREEN>++<RESET>b c<RED>-<RESET><GREEN>--<RESET>d
@@ -30,6 +30,6 @@ a<RED>==<RESET><GREEN>!=<RESET>b c<RED>!=<RESET><GREEN>=<RESET>d
 a<RED>^<RESET><GREEN>^=<RESET>b c<RED>|<RESET><GREEN>|=<RESET>d e<RED>&&<RESET><GREEN>&=<RESET>f
 a<RED>||<RESET><GREEN>|<RESET>b
 a?<GREEN>:<RESET>b
-a<RED>=<RESET><GREEN>==<RESET>b c<RED>+=<RESET><GREEN>+<RESET>d <RED>e-=f<RESET><GREEN>e-f<RESET> g<RED>*=<RESET><GREEN>*<RESET>h i<RED>/=<RESET><GREEN>/<RESET>j k<RED>%=<RESET><GREEN>%<RESET>l m<RED><<=<RESET><GREEN><<<RESET>n o<RED>>>=<RESET><GREEN>>><RESET>p q<RED>&=<RESET><GREEN>&<RESET>r s<RED>^=<RESET><GREEN>^<RESET>t u<RED>|=<RESET><GREEN>|<RESET>v
+a<RED>=<RESET><GREEN>==<RESET>b c<RED>+=<RESET><GREEN>+<RESET>d e<RED>-=<RESET><GREEN>-<RESET>f g<RED>*=<RESET><GREEN>*<RESET>h i<RED>/=<RESET><GREEN>/<RESET>j k<RED>%=<RESET><GREEN>%<RESET>l m<RED><<=<RESET><GREEN><<<RESET>n o<RED>>>=<RESET><GREEN>>><RESET>p q<RED>&=<RESET><GREEN>&<RESET>r s<RED>^=<RESET><GREEN>^<RESET>t u<RED>|=<RESET><GREEN>|<RESET>v
 a,b<RESET>
 a<RED>::<RESET><GREEN>:<RESET>b
diff --git a/userdiff.c b/userdiff.c
index af02b1878c..8b49194f56 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -64,8 +64,14 @@ PATTERNS("cpp",
 	 /* functions/methods, variables, and compounds at top level */
 	 "^((::[[:space:]]*)?[A-Za-z_].*)$",
 	 /* -- */
+	 /* identifiers and keywords */
 	 "[a-zA-Z_][a-zA-Z0-9_]*"
-	 "|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lLuU]*"
+	 /* decimal and octal integers as well as floatingpoint numbers */
+	 "|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*"
+	 /* hexadecimal and binary integers */
+	 "|0[xXbB][0-9a-fA-F]+[lLuU]*"
+	 /* floatingpoint numbers that begin with a decimal point */
+	 "|\\.[0-9]+([Ee][-+]?[0-9]+)?[fFlL]?"
 	 "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*"),
 PATTERNS("csharp",
 	 /* Keywords */

From bfaaf191a5470cb81ce327bc3b9ef9e277c9767b Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Sun, 10 Oct 2021 17:03:02 +0000
Subject: [PATCH 4/7] userdiff-cpp: prepare test cases with yet unsupported
 features

We are going to add support for C++'s digit-separating single-quote and
the spaceship operator. By adding the test cases in this separate
commit, the effect on the word highlighting will become more obvious
as the features are implemented and the file cpp/expect is updated.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 14 +++++++-------
 t/t4034/cpp/post   | 12 ++++++------
 t/t4034/cpp/pre    | 10 +++++-----
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index 46c9460a96..3d37ddac42 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -1,21 +1,21 @@
 <BOLD>diff --git a/pre b/post<RESET>
-<BOLD>index 1229cdb..3feae6f 100644<RESET>
+<BOLD>index 144cd98..64e78af 100644<RESET>
 <BOLD>--- a/pre<RESET>
 <BOLD>+++ b/post<RESET>
 <CYAN>@@ -1,30 +1,30 @@<RESET>
 Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <RED>foo0<RESET><GREEN>bar<RESET>(x.<RED>find<RESET><GREEN>Find<RESET>); }
 cout<<"Hello World<RED>!<RESET><GREEN>?<RESET>\n"<<endl;
-<GREEN>(<RESET>1 <RED>-<RESET><GREEN>+<RESET>1e10 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>y<RESET>'
+<GREEN>(<RESET>1 <RED>-<RESET><GREEN>+<RESET>1e10 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>.<RESET>'
 // long double<RESET>
-<RED>3.141592653e-10l<RESET><GREEN>3.141592654e+10l<RESET>
+3.141'592'<RED>653e-10l<RESET><GREEN>654e+10l<RESET>
 // float<RESET>
 <RED>120E5f<RESET><GREEN>120E6f<RESET>
 // hex<RESET>
-<RED>0xdeadbeaf<RESET><GREEN>0xdeadBeaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
+0xdead'<RED>beaf<RESET><GREEN>Beaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
 // octal<RESET>
-<RED>01234567<RESET><GREEN>01234560<RESET>
+0123'<RED>4567<RESET><GREEN>4560<RESET>
 // binary<RESET>
-<RED>0b1000<RESET><GREEN>0b1100<RESET>+e1
+<RED>0b10<RESET><GREEN>0b11<RESET>'00+e1
 // expression<RESET>
 1.5-e+<RED>2<RESET><GREEN>3<RESET>+f
 // another one<RESET>
@@ -25,7 +25,7 @@ str.e+<RED>65<RESET><GREEN>75<RESET>
 a<RED>*<RESET><GREEN>*=<RESET>b c<RED>/<RESET><GREEN>/=<RESET>d e<RED>%<RESET><GREEN>%=<RESET>f
 a<RED>+<RESET><GREEN>++<RESET>b c<RED>-<RESET><GREEN>--<RESET>d
 a<RED><<<RESET><GREEN><<=<RESET>b c<RED>>><RESET><GREEN>>>=<RESET>d
-a<RED><<RESET><GREEN><=<RESET>b c<RED><=<RESET><GREEN><<RESET>d e<RED>><RESET><GREEN>>=<RESET>f g<RED>>=<RESET><GREEN>><RESET>h
+a<RED><<RESET><GREEN><=<RESET>b c<RED><=<RESET><GREEN><<RESET>d e<RED>><RESET><GREEN>>=<RESET>f g<RED>>=<RESET><GREEN>><RESET>h i<=<GREEN>><RESET>j
 a<RED>==<RESET><GREEN>!=<RESET>b c<RED>!=<RESET><GREEN>=<RESET>d
 a<RED>^<RESET><GREEN>^=<RESET>b c<RED>|<RESET><GREEN>|=<RESET>d e<RED>&&<RESET><GREEN>&=<RESET>f
 a<RED>||<RESET><GREEN>|<RESET>b
diff --git a/t/t4034/cpp/post b/t/t4034/cpp/post
index 3feae6f430..64e78afbfb 100644
--- a/t/t4034/cpp/post
+++ b/t/t4034/cpp/post
@@ -1,16 +1,16 @@
 Foo() : x(0&42) { bar(x.Find); }
 cout<<"Hello World?\n"<<endl;
-(1 +1e10 0xabcdef) 'y'
+(1 +1e10 0xabcdef) '.'
 // long double
-3.141592654e+10l
+3.141'592'654e+10l
 // float
 120E6f
 // hex
-0xdeadBeaf+7ULL
+0xdead'Beaf+7ULL
 // octal
-01234560
+0123'4560
 // binary
-0b1100+e1
+0b11'00+e1
 // expression
 1.5-e+3+f
 // another one
@@ -20,7 +20,7 @@ str.e+75
 a*=b c/=d e%=f
 a++b c--d
 a<<=b c>>=d
-a<=b c<d e>=f g>h
+a<=b c<d e>=f g>h i<=>j
 a!=b c=d
 a^=b c|=d e&=f
 a|b
diff --git a/t/t4034/cpp/pre b/t/t4034/cpp/pre
index 1229cdb59d..144cd980d6 100644
--- a/t/t4034/cpp/pre
+++ b/t/t4034/cpp/pre
@@ -2,15 +2,15 @@ Foo():x(0&&1){ foo0( x.find); }
 cout<<"Hello World!\n"<<endl;
 1 -1e10 0xabcdef 'x'
 // long double
-3.141592653e-10l
+3.141'592'653e-10l
 // float
 120E5f
 // hex
-0xdeadbeaf+8ULL
+0xdead'beaf+8ULL
 // octal
-01234567
+0123'4567
 // binary
-0b1000+e1
+0b10'00+e1
 // expression
 1.5-e+2+f
 // another one
@@ -20,7 +20,7 @@ str.e+65
 a*b c/d e%f
 a+b c-d
 a<<b c>>d
-a<b c<=d e>f g>=h
+a<b c<=d e>f g>=h i<=j
 a==b c!=d
 a^b c|d e&&f
 a||b

From 637b80cd6a2a73eb6723aec2f52aed1135d99de4 Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Sun, 10 Oct 2021 17:03:03 +0000
Subject: [PATCH 5/7] userdiff-cpp: permit the digit-separating single-quote in
 numbers

Since C++14, the single-quote can be used as a digit separator:

   3.141'592'654
   1'000'000
   0xdead'beaf

Make it known to the word regex of the cpp driver, so that numbers are
not split into separate tokens at the single-quotes.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 8 ++++----
 userdiff.c         | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index 3d37ddac42..b90b3f207b 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -7,15 +7,15 @@ Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <RED>foo0<RESET><GREEN>bar<RESET>
 cout<<"Hello World<RED>!<RESET><GREEN>?<RESET>\n"<<endl;
 <GREEN>(<RESET>1 <RED>-<RESET><GREEN>+<RESET>1e10 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>.<RESET>'
 // long double<RESET>
-3.141'592'<RED>653e-10l<RESET><GREEN>654e+10l<RESET>
+<RED>3.141'592'653e-10l<RESET><GREEN>3.141'592'654e+10l<RESET>
 // float<RESET>
 <RED>120E5f<RESET><GREEN>120E6f<RESET>
 // hex<RESET>
-0xdead'<RED>beaf<RESET><GREEN>Beaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
+<RED>0xdead'beaf<RESET><GREEN>0xdead'Beaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
 // octal<RESET>
-0123'<RED>4567<RESET><GREEN>4560<RESET>
+<RED>0123'4567<RESET><GREEN>0123'4560<RESET>
 // binary<RESET>
-<RED>0b10<RESET><GREEN>0b11<RESET>'00+e1
+<RED>0b10'00<RESET><GREEN>0b11'00<RESET>+e1
 // expression<RESET>
 1.5-e+<RED>2<RESET><GREEN>3<RESET>+f
 // another one<RESET>
diff --git a/userdiff.c b/userdiff.c
index 8b49194f56..c1084650dd 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -67,11 +67,11 @@ PATTERNS("cpp",
 	 /* identifiers and keywords */
 	 "[a-zA-Z_][a-zA-Z0-9_]*"
 	 /* decimal and octal integers as well as floatingpoint numbers */
-	 "|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*"
+	 "|[0-9][0-9.']*([Ee][-+]?[0-9]+)?[fFlLuU]*"
 	 /* hexadecimal and binary integers */
-	 "|0[xXbB][0-9a-fA-F]+[lLuU]*"
+	 "|0[xXbB][0-9a-fA-F']+[lLuU]*"
 	 /* floatingpoint numbers that begin with a decimal point */
-	 "|\\.[0-9]+([Ee][-+]?[0-9]+)?[fFlL]?"
+	 "|\\.[0-9][0-9']*([Ee][-+]?[0-9]+)?[fFlL]?"
 	 "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*"),
 PATTERNS("csharp",
 	 /* Keywords */

From c4fdba338355d80e40b84391af9f8c022d4f21af Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Sun, 10 Oct 2021 17:03:04 +0000
Subject: [PATCH 6/7] userdiff-cpp: learn the C++ spaceship operator

Since C++20, the language has a generalized comparison operator <=>.
Teach the cpp driver not to separate it into <= and > tokens.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 2 +-
 userdiff.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index b90b3f207b..5ff4ce477b 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -25,7 +25,7 @@ str.e+<RED>65<RESET><GREEN>75<RESET>
 a<RED>*<RESET><GREEN>*=<RESET>b c<RED>/<RESET><GREEN>/=<RESET>d e<RED>%<RESET><GREEN>%=<RESET>f
 a<RED>+<RESET><GREEN>++<RESET>b c<RED>-<RESET><GREEN>--<RESET>d
 a<RED><<<RESET><GREEN><<=<RESET>b c<RED>>><RESET><GREEN>>>=<RESET>d
-a<RED><<RESET><GREEN><=<RESET>b c<RED><=<RESET><GREEN><<RESET>d e<RED>><RESET><GREEN>>=<RESET>f g<RED>>=<RESET><GREEN>><RESET>h i<=<GREEN>><RESET>j
+a<RED><<RESET><GREEN><=<RESET>b c<RED><=<RESET><GREEN><<RESET>d e<RED>><RESET><GREEN>>=<RESET>f g<RED>>=<RESET><GREEN>><RESET>h i<RED><=<RESET><GREEN><=><RESET>j
 a<RED>==<RESET><GREEN>!=<RESET>b c<RED>!=<RESET><GREEN>=<RESET>d
 a<RED>^<RESET><GREEN>^=<RESET>b c<RED>|<RESET><GREEN>|=<RESET>d e<RED>&&<RESET><GREEN>&=<RESET>f
 a<RED>||<RESET><GREEN>|<RESET>b
diff --git a/userdiff.c b/userdiff.c
index c1084650dd..7b143ef36b 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -72,7 +72,7 @@ PATTERNS("cpp",
 	 "|0[xXbB][0-9a-fA-F']+[lLuU]*"
 	 /* floatingpoint numbers that begin with a decimal point */
 	 "|\\.[0-9][0-9']*([Ee][-+]?[0-9]+)?[fFlL]?"
-	 "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*"),
+	 "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*|<=>"),
 PATTERNS("csharp",
 	 /* Keywords */
 	 "!^[ \t]*(do|while|for|if|else|instanceof|new|return|switch|case|throw|catch|using)\n"

From 386076ec92c702104cb15bc23e4521dac10c7c2d Mon Sep 17 00:00:00 2001
From: Johannes Sixt <j6t@kdbg.org>
Date: Sun, 24 Oct 2021 11:56:43 +0200
Subject: [PATCH 7/7] userdiff-cpp: back out the digit-separators in numbers

The implementation of digit-separating single-quotes introduced a
noteworthy regression: the change of a character literal with a
digit would splice the digit and the closing single-quote. For
example, the change from 'a' to '2' is now tokenized as
'[-a'-]{+2'+} instead of '[-a-]{+2+}'.

The options to fix the regression are:

- Tighten the regular expression such that the single-quote can only
  occur between digits (that would match the official syntax).

- Remove support for digit separators.

I chose to remove support, because

- I have not seen a lot of code make use of digit separators.

- If code does use digit separators, then the numbers are typically
  long. If a change occurs in one of the segments, it is actually
  easier to see if only that segment is highlighted as the word
  that changed instead of the whole long number.

This choice does introduce another minor regression, though, which
is highlighted in the test case: when a change occurs in the second
or later segment of a hexadecimal number where the segment begins
with a digit, but also has letters, the segment is mistaken as
consisting of a number and an identifier. I can live with that.

Signed-off-by: Johannes Sixt <j6t@kdbg.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4034/cpp/expect | 12 ++++++------
 t/t4034/cpp/post   | 10 +++++-----
 t/t4034/cpp/pre    |  8 ++++----
 userdiff.c         |  6 +++---
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect
index 5ff4ce477b..dc500ae092 100644
--- a/t/t4034/cpp/expect
+++ b/t/t4034/cpp/expect
@@ -1,21 +1,21 @@
 <BOLD>diff --git a/pre b/post<RESET>
-<BOLD>index 144cd98..64e78af 100644<RESET>
+<BOLD>index a1a09b7..f1b6f3c 100644<RESET>
 <BOLD>--- a/pre<RESET>
 <BOLD>+++ b/post<RESET>
 <CYAN>@@ -1,30 +1,30 @@<RESET>
 Foo() : x(0<RED>&&1<RESET><GREEN>&42<RESET>) { <RED>foo0<RESET><GREEN>bar<RESET>(x.<RED>find<RESET><GREEN>Find<RESET>); }
 cout<<"Hello World<RED>!<RESET><GREEN>?<RESET>\n"<<endl;
-<GREEN>(<RESET>1 <RED>-<RESET><GREEN>+<RESET>1e10 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>.<RESET>'
+<GREEN>(<RESET>1 <RED>-<RESET><GREEN>+<RESET>1e10 0xabcdef<GREEN>)<RESET> '<RED>x<RESET><GREEN>2<RESET>'
 // long double<RESET>
-<RED>3.141'592'653e-10l<RESET><GREEN>3.141'592'654e+10l<RESET>
+<RED>3.141592653e-10l<RESET><GREEN>3.141592654e+10l<RESET>
 // float<RESET>
 <RED>120E5f<RESET><GREEN>120E6f<RESET>
 // hex<RESET>
-<RED>0xdead'beaf<RESET><GREEN>0xdead'Beaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
+<RED>0xdead<RESET><GREEN>0xdeaf<RESET>'1<RED>eaF<RESET><GREEN>eaf<RESET>+<RED>8ULL<RESET><GREEN>7ULL<RESET>
 // octal<RESET>
-<RED>0123'4567<RESET><GREEN>0123'4560<RESET>
+<RED>01234567<RESET><GREEN>01234560<RESET>
 // binary<RESET>
-<RED>0b10'00<RESET><GREEN>0b11'00<RESET>+e1
+<RED>0b1000<RESET><GREEN>0b1100<RESET>+e1
 // expression<RESET>
 1.5-e+<RED>2<RESET><GREEN>3<RESET>+f
 // another one<RESET>
diff --git a/t/t4034/cpp/post b/t/t4034/cpp/post
index 64e78afbfb..f1b6f3c228 100644
--- a/t/t4034/cpp/post
+++ b/t/t4034/cpp/post
@@ -1,16 +1,16 @@
 Foo() : x(0&42) { bar(x.Find); }
 cout<<"Hello World?\n"<<endl;
-(1 +1e10 0xabcdef) '.'
+(1 +1e10 0xabcdef) '2'
 // long double
-3.141'592'654e+10l
+3.141592654e+10l
 // float
 120E6f
 // hex
-0xdead'Beaf+7ULL
+0xdeaf'1eaf+7ULL
 // octal
-0123'4560
+01234560
 // binary
-0b11'00+e1
+0b1100+e1
 // expression
 1.5-e+3+f
 // another one
diff --git a/t/t4034/cpp/pre b/t/t4034/cpp/pre
index 144cd980d6..a1a09b7712 100644
--- a/t/t4034/cpp/pre
+++ b/t/t4034/cpp/pre
@@ -2,15 +2,15 @@ Foo():x(0&&1){ foo0( x.find); }
 cout<<"Hello World!\n"<<endl;
 1 -1e10 0xabcdef 'x'
 // long double
-3.141'592'653e-10l
+3.141592653e-10l
 // float
 120E5f
 // hex
-0xdead'beaf+8ULL
+0xdead'1eaF+8ULL
 // octal
-0123'4567
+01234567
 // binary
-0b10'00+e1
+0b1000+e1
 // expression
 1.5-e+2+f
 // another one
diff --git a/userdiff.c b/userdiff.c
index 7b143ef36b..8578cb0d12 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -67,11 +67,11 @@ PATTERNS("cpp",
 	 /* identifiers and keywords */
 	 "[a-zA-Z_][a-zA-Z0-9_]*"
 	 /* decimal and octal integers as well as floatingpoint numbers */
-	 "|[0-9][0-9.']*([Ee][-+]?[0-9]+)?[fFlLuU]*"
+	 "|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*"
 	 /* hexadecimal and binary integers */
-	 "|0[xXbB][0-9a-fA-F']+[lLuU]*"
+	 "|0[xXbB][0-9a-fA-F]+[lLuU]*"
 	 /* floatingpoint numbers that begin with a decimal point */
-	 "|\\.[0-9][0-9']*([Ee][-+]?[0-9]+)?[fFlL]?"
+	 "|\\.[0-9][0-9]*([Ee][-+]?[0-9]+)?[fFlL]?"
 	 "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*|<=>"),
 PATTERNS("csharp",
 	 /* Keywords */