git-p4: improve encoding handling to support inconsistent encodings

git-p4 is designed to run correctly under python2.7 and python3, but
its functional behavior wrt importing user-entered text differs across
these environments:

Under python2, git-p4 "naively" writes the Perforce bytestream into git
metadata (and does not set an "encoding" header on the commits); this
means that any non-utf-8 byte sequences end up creating invalidly-encoded
commit metadata in git.

Under python3, git-p4 attempts to decode the Perforce bytestream as utf-8
data, and fails badly (with an unhelpful error) when non-utf-8 data is
encountered.

Perforce clients (especially p4v) encourage user entry of changelist
descriptions (and user full names) in OS-local encoding, and store the
resulting bytestream to the server unmodified - such that different
clients can end up creating mutually-unintelligible messages. The most
common inconsistency, in many Perforce environments, is likely to be utf-8
(typical in linux) vs cp-1252 (typical in windows).

Make the changelist-description- and user-fullname-handling code
python-runtime-agnostic, introducing three "strategies" selectable via
config:
- 'passthrough', behaving as previously under python2,
- 'strict', behaving as previously under python3, and
- 'fallback', favoring utf-8 but supporting a secondary encoding when
utf-8 decoding fails, and finally escaping high-range bytes if the
decoding with the secondary encoding also fails.

Keep the python2 default behavior as-is ('legacy' strategy), but switch
the python3 default strategy to 'fallback' with default fallback encoding
'cp1252'.

Also include tests exercising these encoding strategies, documentation for
the new config, and improve the user-facing error messages when decoding
does fail.

Signed-off-by: Tao Klerks <tao@klerks.biz>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Tao Klerks
2022-04-30 19:26:52 +00:00
committed by Junio C Hamano
parent 6cd33dceed
commit f7b5ff607f
5 changed files with 572 additions and 18 deletions

123
git-p4.py
View File

@ -15,6 +15,7 @@
# pylint: disable=too-many-statements,too-many-instance-attributes
# pylint: disable=too-many-branches,too-many-nested-blocks
#
import struct
import sys
if sys.version_info.major < 3 and sys.version_info.minor < 7:
sys.stderr.write("git-p4: requires Python 2.7 or later.\n")
@ -54,6 +55,9 @@ defaultLabelRegexp = r'[a-zA-Z0-9_\-.]+$'
# The block size is reduced automatically if required
defaultBlockSize = 1<<20
defaultMetadataDecodingStrategy = 'passthrough' if sys.version_info.major == 2 else 'fallback'
defaultFallbackMetadataEncoding = 'cp1252'
p4_access_checked = False
re_ko_keywords = re.compile(br'\$(Id|Header)(:[^$\n]+)?\$')
@ -203,6 +207,70 @@ else:
def encode_text_stream(s):
return s.encode('utf_8') if isinstance(s, unicode) else s
class MetadataDecodingException(Exception):
def __init__(self, input_string):
self.input_string = input_string
def __str__(self):
return """Decoding perforce metadata failed!
The failing string was:
---
{}
---
Consider setting the git-p4.metadataDecodingStrategy config option to
'fallback', to allow metadata to be decoded using a fallback encoding,
defaulting to cp1252.""".format(self.input_string)
encoding_fallback_warning_issued = False
encoding_escape_warning_issued = False
def metadata_stream_to_writable_bytes(s):
encodingStrategy = gitConfig('git-p4.metadataDecodingStrategy') or defaultMetadataDecodingStrategy
fallbackEncoding = gitConfig('git-p4.metadataFallbackEncoding') or defaultFallbackMetadataEncoding
if not isinstance(s, bytes):
return s.encode('utf_8')
if encodingStrategy == 'passthrough':
return s
try:
s.decode('utf_8')
return s
except UnicodeDecodeError:
if encodingStrategy == 'fallback' and fallbackEncoding:
global encoding_fallback_warning_issued
global encoding_escape_warning_issued
try:
if not encoding_fallback_warning_issued:
print("\nCould not decode value as utf-8; using configured fallback encoding %s: %s" % (fallbackEncoding, s))
print("\n(this warning is only displayed once during an import)")
encoding_fallback_warning_issued = True
return s.decode(fallbackEncoding).encode('utf_8')
except Exception as exc:
if not encoding_escape_warning_issued:
print("\nCould not decode value with configured fallback encoding %s; escaping bytes over 127: %s" % (fallbackEncoding, s))
print("\n(this warning is only displayed once during an import)")
encoding_escape_warning_issued = True
escaped_bytes = b''
# bytes and strings work very differently in python2 vs python3...
if str is bytes:
for byte in s:
byte_number = struct.unpack('>B', byte)[0]
if byte_number > 127:
escaped_bytes += b'%'
escaped_bytes += hex(byte_number)[2:].upper()
else:
escaped_bytes += byte
else:
for byte_number in s:
if byte_number > 127:
escaped_bytes += b'%'
escaped_bytes += hex(byte_number).upper().encode()[2:]
else:
escaped_bytes += bytes([byte_number])
return escaped_bytes
raise MetadataDecodingException(s)
def decode_path(path):
"""Decode a given string (bytes or otherwise) using configured path encoding options
"""
@ -702,11 +770,12 @@ def p4CmdList(cmd, stdin=None, stdin_mode='w+b', cb=None, skip_info=False,
if bytes is not str:
# Decode unmarshalled dict to use str keys and values, except for:
# - `data` which may contain arbitrary binary data
# - `depotFile[0-9]*`, `path`, or `clientFile` which may contain non-UTF8 encoded text
# - `desc` or `FullName` which may contain non-UTF8 encoded text handled below, eagerly converted to bytes
# - `depotFile[0-9]*`, `path`, or `clientFile` which may contain non-UTF8 encoded text, handled by decode_path()
decoded_entry = {}
for key, value in entry.items():
key = key.decode()
if isinstance(value, bytes) and not (key in ('data', 'path', 'clientFile') or key.startswith('depotFile')):
if isinstance(value, bytes) and not (key in ('data', 'desc', 'FullName', 'path', 'clientFile') or key.startswith('depotFile')):
value = value.decode()
decoded_entry[key] = value
# Parse out data if it's an error response
@ -716,6 +785,10 @@ def p4CmdList(cmd, stdin=None, stdin_mode='w+b', cb=None, skip_info=False,
if skip_info:
if 'code' in entry and entry['code'] == 'info':
continue
if 'desc' in entry:
entry['desc'] = metadata_stream_to_writable_bytes(entry['desc'])
if 'FullName' in entry:
entry['FullName'] = metadata_stream_to_writable_bytes(entry['FullName'])
if cb is not None:
cb(entry)
else:
@ -1435,7 +1508,13 @@ class P4UserMap:
for output in p4CmdList(["users"]):
if "User" not in output:
continue
self.users[output["User"]] = output["FullName"] + " <" + output["Email"] + ">"
# "FullName" is bytes. "Email" on the other hand might be bytes
# or unicode string depending on whether we are running under
# python2 or python3. To support
# git-p4.metadataDecodingStrategy=fallback, self.users dict values
# are always bytes, ready to be written to git.
emailbytes = metadata_stream_to_writable_bytes(output["Email"])
self.users[output["User"]] = output["FullName"] + b" <" + emailbytes + b">"
self.emails[output["Email"]] = output["User"]
mapUserConfigRegex = re.compile(r"^\s*(\S+)\s*=\s*(.+)\s*<(\S+)>\s*$", re.VERBOSE)
@ -1445,26 +1524,28 @@ class P4UserMap:
user = mapUser[0][0]
fullname = mapUser[0][1]
email = mapUser[0][2]
self.users[user] = fullname + " <" + email + ">"
fulluser = fullname + " <" + email + ">"
self.users[user] = metadata_stream_to_writable_bytes(fulluser)
self.emails[email] = user
s = ''
s = b''
for (key, val) in self.users.items():
s += "%s\t%s\n" % (key.expandtabs(1), val.expandtabs(1))
keybytes = metadata_stream_to_writable_bytes(key)
s += b"%s\t%s\n" % (keybytes.expandtabs(1), val.expandtabs(1))
open(self.getUserCacheFilename(), 'w').write(s)
open(self.getUserCacheFilename(), 'wb').write(s)
self.userMapFromPerforceServer = True
def loadUserMapFromCache(self):
self.users = {}
self.userMapFromPerforceServer = False
try:
cache = open(self.getUserCacheFilename(), 'r')
cache = open(self.getUserCacheFilename(), 'rb')
lines = cache.readlines()
cache.close()
for line in lines:
entry = line.strip().split("\t")
self.users[entry[0]] = entry[1]
entry = line.strip().split(b"\t")
self.users[entry[0].decode('utf_8')] = entry[1]
except IOError:
self.getUserMapFromPerforceServer()
@ -3020,7 +3101,8 @@ class P4Sync(Command, P4UserMap):
if userid in self.users:
return self.users[userid]
else:
return "%s <a@b>" % userid
userid_bytes = metadata_stream_to_writable_bytes(userid)
return b"%s <a@b>" % userid_bytes
def streamTag(self, gitStream, labelName, labelDetails, commit, epoch):
""" Stream a p4 tag.
@ -3043,9 +3125,10 @@ class P4Sync(Command, P4UserMap):
email = self.make_email(owner)
else:
email = self.make_email(self.p4UserId())
tagger = "%s %s %s" % (email, epoch, self.tz)
gitStream.write("tagger %s\n" % tagger)
gitStream.write("tagger ")
gitStream.write(email)
gitStream.write(" %s %s\n" % (epoch, self.tz))
print("labelDetails=",labelDetails)
if 'Description' in labelDetails:
@ -3138,12 +3221,12 @@ class P4Sync(Command, P4UserMap):
self.gitStream.write("commit %s\n" % branch)
self.gitStream.write("mark :%s\n" % details["change"])
self.committedChanges.add(int(details["change"]))
committer = ""
if author not in self.users:
self.getUserMapFromPerforceServer()
committer = "%s %s %s" % (self.make_email(author), epoch, self.tz)
self.gitStream.write("committer %s\n" % committer)
self.gitStream.write("committer ")
self.gitStream.write(self.make_email(author))
self.gitStream.write(" %s %s\n" % (epoch, self.tz))
self.gitStream.write("data <<EOT\n")
self.gitStream.write(details["desc"])
@ -4055,6 +4138,14 @@ class P4Clone(P4Sync):
if self.useClientSpec_from_options:
system(["git", "config", "--bool", "git-p4.useclientspec", "true"])
# persist any git-p4 encoding-handling config options passed in for clone:
if gitConfig('git-p4.metadataDecodingStrategy'):
system(["git", "config", "git-p4.metadataDecodingStrategy", gitConfig('git-p4.metadataDecodingStrategy')])
if gitConfig('git-p4.metadataFallbackEncoding'):
system(["git", "config", "git-p4.metadataFallbackEncoding", gitConfig('git-p4.metadataFallbackEncoding')])
if gitConfig('git-p4.pathEncoding'):
system(["git", "config", "git-p4.pathEncoding", gitConfig('git-p4.pathEncoding')])
return True
class P4Unshelve(Command):