Skip to content

Commit 3e82ed4

Browse files
tonyliaosscopybara-github
authored andcommitted
Generate internal hasbits for singular proto3 implicit presence fields.
N.B.: - This change is not intended to affect any well-defined protobuf behaviour in an observable way. - The wire parsing codepath is not affected. - This change only affects the C++ protobuf implementation (other languages are not affected). - sizeof proto3 message objects may increase in 32-bit increments to accommodate hasbits. - When profiled on some of Google's largest binaries, we have seen a code size increase of ~0.1%, which we consider to be a reasonable increase. There are quite a few terminologies in the title: - **singular**: a field that is not repeated, not oneof, not extension, not lazy, just a field with a simple primitive type (number or boolean), or string/bytes. - **proto3**: describes behaviour consistent to the "proto3" syntax. This is equivalent to `edition = "2023"` with `option features.field_presence = IMPLICIT;`. - **implicit presence**: describes behaviour consistent with "non-optional" fields in proto3. This is described in more detail in https://protobuf.dev/programming-guides/field_presence/#presence-in-proto3-apis This change enables C++ proto3 objects to generate hasbits for regular proto3 (i.e. non-`optional`) fields. This code change might make certain codepaths negligibly more efficient, but large improvement or regression is unlikely. A larger performance improvement is expected from generating hasbits for repeated fields -- this change will pave the way for future work there. Hasbits in C++ will have slightly different semantics for implicit presence fields. In the past, all hasbits are true field presence indicators. If the hasbit is set, the field is guaranteed to be present; if the hasbit is unset, the field is guaranteed to be missing. This change introduces a new hasbit mode that I will call "hint hasbits", denoted by a newly-introduced enum, `internal::cpp::HasbitMode::kHintHasbit`. For implicit presence fields, it may be possible to mutate the field and have it end up as a zero field, especially with `mutable_foo` APIs. To handle those cases correctly, we unconditionally set the hasbit when `mutable_foo` is called, then we must do an additional check for field emptiness before serializing the field onto the wire. PiperOrigin-RevId: 691945237
1 parent 56580bd commit 3e82ed4

File tree

9 files changed

+603
-96
lines changed

9 files changed

+603
-96
lines changed

src/google/protobuf/compiler/cpp/field_generators/string_field.cc

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,23 @@
1616
#include "absl/log/absl_check.h"
1717
#include "absl/memory/memory.h"
1818
#include "absl/strings/str_cat.h"
19+
#include "absl/strings/string_view.h"
1920
#include "google/protobuf/compiler/cpp/field.h"
2021
#include "google/protobuf/compiler/cpp/field_generators/generators.h"
2122
#include "google/protobuf/compiler/cpp/helpers.h"
2223
#include "google/protobuf/compiler/cpp/options.h"
2324
#include "google/protobuf/descriptor.h"
2425
#include "google/protobuf/descriptor.pb.h"
2526
#include "google/protobuf/io/printer.h"
27+
#include "google/protobuf/port.h"
2628

2729
namespace google {
2830
namespace protobuf {
2931
namespace compiler {
3032
namespace cpp {
3133
namespace {
34+
using ::google::protobuf::internal::cpp::GetFieldHasbitMode;
35+
using ::google::protobuf::internal::cpp::HasbitMode;
3236
using ::google::protobuf::internal::cpp::HasHasbit;
3337
using ::google::protobuf::io::AnnotationCollector;
3438
using Sub = ::google::protobuf::io::Printer::Sub;
@@ -533,6 +537,22 @@ void SingularString::GenerateClearingCode(io::Printer* p) const {
533537
)cc");
534538
}
535539

540+
// Returns "ClearNonDefaultToEmpty" or "ClearToEmpty" depending on whether the
541+
// field might still point to the default string instance.
542+
absl::string_view GetClearFunctionForField(const FieldDescriptor* field) {
543+
switch (GetFieldHasbitMode(field)) {
544+
case HasbitMode::kNoHasbit:
545+
case HasbitMode::kHintHasbit:
546+
// TODO: b/376149315 - Would be nice to call ClearNonDefaultToEmpty for
547+
// hint hasbits too.
548+
return "ClearToEmpty";
549+
case HasbitMode::kTrueHasbit:
550+
return "ClearNonDefaultToEmpty";
551+
default:
552+
internal::Unreachable();
553+
}
554+
}
555+
536556
void SingularString::GenerateMessageClearingCode(io::Printer* p) const {
537557
if (is_oneof()) {
538558
p->Emit(R"cc(
@@ -573,8 +593,7 @@ void SingularString::GenerateMessageClearingCode(io::Printer* p) const {
573593
return;
574594
}
575595

576-
p->Emit({{"Clear",
577-
HasHasbit(field_) ? "ClearNonDefaultToEmpty" : "ClearToEmpty"}},
596+
p->Emit({{"Clear", GetClearFunctionForField(field_)}},
578597
R"cc(
579598
$field_$.$Clear$();
580599
)cc");

src/google/protobuf/compiler/cpp/message.cc

Lines changed: 61 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ namespace cpp {
6262
namespace {
6363
using ::google::protobuf::internal::WireFormat;
6464
using ::google::protobuf::internal::WireFormatLite;
65+
using ::google::protobuf::internal::cpp::GetFieldHasbitMode;
66+
using ::google::protobuf::internal::cpp::HasbitMode;
6567
using ::google::protobuf::internal::cpp::HasHasbit;
6668
using Semantic = ::google::protobuf::io::AnnotationCollector::Semantic;
6769
using Sub = ::google::protobuf::io::Printer::Sub;
@@ -187,7 +189,7 @@ RunMap FindRuns(const std::vector<const FieldDescriptor*>& fields,
187189

188190
void EmitNonDefaultCheck(io::Printer* p, const std::string& prefix,
189191
const FieldDescriptor* field) {
190-
ABSL_CHECK(!HasHasbit(field));
192+
ABSL_CHECK(GetFieldHasbitMode(field) != HasbitMode::kTrueHasbit);
191193
ABSL_CHECK(!field->is_repeated());
192194
ABSL_CHECK(!field->containing_oneof() || field->real_containing_oneof());
193195

@@ -216,19 +218,27 @@ void EmitNonDefaultCheck(io::Printer* p, const std::string& prefix,
216218
}
217219

218220
bool ShouldEmitNonDefaultCheck(const FieldDescriptor* field) {
219-
return (!field->is_repeated() && !field->containing_oneof()) ||
220-
field->real_containing_oneof();
221+
if (GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit) {
222+
return false;
223+
}
224+
return !field->is_repeated();
221225
}
222226

223227
// Emits an if-statement with a condition that evaluates to true if |field| is
224228
// considered non-default (will be sent over the wire), for message types
225229
// without true field presence. Should only be called if
226230
// !HasHasbit(field).
231+
// If |with_enclosing_braces_always| is set to true, will generate enclosing
232+
// braces even if nondefault check is not emitted -- i.e. code may look like:
233+
// {
234+
// // code...
235+
// }
236+
// If |with_enclosing_braces_always| is set to false, enclosing braces will not
237+
// be generated if nondefault check is not emitted.
227238
void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix,
228239
const FieldDescriptor* field,
229-
absl::AnyInvocable<void()> emit_body) {
230-
ABSL_CHECK(!HasHasbit(field));
231-
240+
absl::AnyInvocable<void()> emit_body,
241+
bool with_enclosing_braces_always) {
232242
if (ShouldEmitNonDefaultCheck(field)) {
233243
p->Emit(
234244
{
@@ -240,7 +250,10 @@ void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix,
240250
$emit_body$;
241251
}
242252
)cc");
243-
} else {
253+
return;
254+
}
255+
256+
if (with_enclosing_braces_always) {
244257
// In repeated fields, the same variable name may be emitted multiple
245258
// times, hence the need for emitting braces even when the if condition is
246259
// not necessary, so that the code looks like:
@@ -259,7 +272,11 @@ void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix,
259272
$emit_body$;
260273
}
261274
)cc");
275+
return;
262276
}
277+
278+
// If no enclosing braces need to be emitted, just emit the body directly.
279+
emit_body();
263280
}
264281

265282
bool HasInternalHasMethod(const FieldDescriptor* field) {
@@ -1034,7 +1051,7 @@ void MessageGenerator::GenerateSingularFieldHasBits(
10341051
)cc");
10351052
return;
10361053
}
1037-
if (HasHasbit(field)) {
1054+
if (GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit) {
10381055
auto v = p->WithVars(HasBitVars(field));
10391056
p->Emit(
10401057
{Sub{"ASSUME",
@@ -1244,7 +1261,8 @@ void MessageGenerator::EmitCheckAndUpdateByteSizeForField(
12441261
};
12451262

12461263
if (!HasHasbit(field)) {
1247-
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body));
1264+
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body),
1265+
/*with_enclosing_braces_always=*/true);
12481266
return;
12491267
}
12501268
if (field->options().weak()) {
@@ -1260,10 +1278,17 @@ void MessageGenerator::EmitCheckAndUpdateByteSizeForField(
12601278
int has_bit_index = has_bit_indices_[field->index()];
12611279
p->Emit({{"mask",
12621280
absl::StrFormat("0x%08xu", uint32_t{1} << (has_bit_index % 32))},
1263-
{"emit_body", [&] { emit_body(); }}},
1281+
{"check_nondefault_and_emit_body",
1282+
[&] {
1283+
// Note that it's possible that the field has explicit presence.
1284+
// In that case, nondefault check will not be emitted but
1285+
// emit_body will still be emitted.
1286+
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body),
1287+
/*with_enclosing_braces_always=*/false);
1288+
}}},
12641289
R"cc(
12651290
if (cached_has_bits & $mask$) {
1266-
$emit_body$;
1291+
$check_nondefault_and_emit_body$;
12671292
}
12681293
)cc");
12691294
}
@@ -4227,9 +4252,10 @@ void MessageGenerator::GenerateClassSpecificMergeImpl(io::Printer* p) {
42274252
} else if (field->is_optional() && !HasHasbit(field)) {
42284253
// Merge semantics without true field presence: primitive fields are
42294254
// merged only if non-zero (numeric) or non-empty (string).
4230-
MayEmitIfNonDefaultCheck(p, "from.", field, /*emit_body=*/[&]() {
4231-
generator.GenerateMergingCode(p);
4232-
});
4255+
MayEmitIfNonDefaultCheck(
4256+
p, "from.", field,
4257+
/*emit_body=*/[&]() { generator.GenerateMergingCode(p); },
4258+
/*with_enclosing_braces_always=*/true);
42334259
} else if (field->options().weak() ||
42344260
cached_has_word_index != HasWordIndex(field)) {
42354261
// Check hasbit, not using cached bits.
@@ -4250,10 +4276,20 @@ void MessageGenerator::GenerateClassSpecificMergeImpl(io::Printer* p) {
42504276
format("if (cached_has_bits & 0x$1$u) {\n", mask);
42514277
format.Indent();
42524278

4253-
if (check_has_byte && IsPOD(field)) {
4254-
generator.GenerateCopyConstructorCode(p);
4279+
if (GetFieldHasbitMode(field) == HasbitMode::kHintHasbit) {
4280+
// Merge semantics without true field presence: primitive fields are
4281+
// merged only if non-zero (numeric) or non-empty (string).
4282+
MayEmitIfNonDefaultCheck(
4283+
p, "from.", field,
4284+
/*emit_body=*/[&]() { generator.GenerateMergingCode(p); },
4285+
/*with_enclosing_braces_always=*/false);
42554286
} else {
4256-
generator.GenerateMergingCode(p);
4287+
ABSL_DCHECK(GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit);
4288+
if (check_has_byte && IsPOD(field)) {
4289+
generator.GenerateCopyConstructorCode(p);
4290+
} else {
4291+
generator.GenerateMergingCode(p);
4292+
}
42574293
}
42584294

42594295
format.Outdent();
@@ -4476,7 +4512,12 @@ void MessageGenerator::GenerateSerializeOneField(io::Printer* p,
44764512
if (HasHasbit(field)) {
44774513
p->Emit(
44784514
{
4479-
{"body", emit_body},
4515+
{"body",
4516+
[&]() {
4517+
MayEmitIfNonDefaultCheck(p, "this_.", field,
4518+
std::move(emit_body),
4519+
/*with_enclosing_braces_always=*/false);
4520+
}},
44804521
{"cond",
44814522
[&] {
44824523
int has_bit_index = HasBitIndex(field);
@@ -4496,7 +4537,8 @@ void MessageGenerator::GenerateSerializeOneField(io::Printer* p,
44964537
}
44974538
)cc");
44984539
} else if (field->is_optional()) {
4499-
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body));
4540+
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body),
4541+
/*with_enclosing_braces_always=*/true);
45004542
} else {
45014543
emit_body();
45024544
}

src/google/protobuf/descriptor.cc

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9798,9 +9798,27 @@ bool HasPreservingUnknownEnumSemantics(const FieldDescriptor* field) {
97989798
return field->enum_type() != nullptr && !field->enum_type()->is_closed();
97999799
}
98009800

9801+
HasbitMode GetFieldHasbitMode(const FieldDescriptor* field) {
9802+
// Do not generate hasbits for "real-oneof" and weak fields.
9803+
if (field->real_containing_oneof() || field->options().weak()) {
9804+
return HasbitMode::kNoHasbit;
9805+
}
9806+
9807+
// Explicit-presence fields always have true hasbits.
9808+
if (field->has_presence()) {
9809+
return HasbitMode::kTrueHasbit;
9810+
}
9811+
9812+
// Implicit presence fields.
9813+
if (!field->is_repeated()) {
9814+
return HasbitMode::kHintHasbit;
9815+
}
9816+
// We currently don't implement hasbits for implicit repeated fields.
9817+
return HasbitMode::kNoHasbit;
9818+
}
9819+
98019820
bool HasHasbit(const FieldDescriptor* field) {
9802-
return field->has_presence() && !field->real_containing_oneof() &&
9803-
!field->options().weak();
9821+
return GetFieldHasbitMode(field) != HasbitMode::kNoHasbit;
98049822
}
98059823

98069824
static bool IsVerifyUtf8(const FieldDescriptor* field, bool is_lite) {

src/google/protobuf/descriptor.h

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include "google/protobuf/stubs/common.h"
4343
#include "absl/base/attributes.h"
4444
#include "absl/base/call_once.h"
45+
#include "absl/base/optimization.h"
4546
#include "absl/container/btree_map.h"
4647
#include "absl/container/flat_hash_map.h"
4748
#include "absl/functional/any_invocable.h"
@@ -2949,9 +2950,33 @@ constexpr int MaxMessageDeclarationNestingDepth() { return 32; }
29492950
PROTOBUF_EXPORT bool HasPreservingUnknownEnumSemantics(
29502951
const FieldDescriptor* field);
29512952

2952-
PROTOBUF_EXPORT bool HasHasbit(const FieldDescriptor* field);
2953-
29542953
#ifndef SWIG
2954+
enum class HasbitMode : uint8_t {
2955+
// Hasbits do not exist for the field.
2956+
kNoHasbit,
2957+
// Hasbits exist and indicate field presence.
2958+
// Hasbit is set if and only if field is present.
2959+
kTrueHasbit,
2960+
// Hasbits exist and "hint at" field presence.
2961+
// When hasbit is set, field is 'probably' present, but field accessors must
2962+
// still check for field presence (i.e. false positives are possible).
2963+
// When hasbit is unset, field is guaranteed to be not present.
2964+
kHintHasbit,
2965+
};
2966+
2967+
// Returns the "hasbit mode" of the field. Depending on the implementation, a
2968+
// field can:
2969+
// - have no hasbits in its internal object (kNoHasbit);
2970+
// - have hasbits where hasbit == 1 indicates field presence and hasbit == 0
2971+
// indicates an unset field (kTrueHasbit);
2972+
// - have hasbits where hasbit == 1 indicates "field is possibly modified" and
2973+
// hasbit == 0 indicates "field is definitely missing" (kHintHasbit).
2974+
PROTOBUF_EXPORT HasbitMode GetFieldHasbitMode(const FieldDescriptor* field);
2975+
2976+
// Returns true if there are hasbits for the field.
2977+
// Note that this does not correlate with "hazzer"s, i.e., whether has_foo APIs
2978+
// are emitted.
2979+
PROTOBUF_EXPORT bool HasHasbit(const FieldDescriptor* field);
29552980

29562981
enum class Utf8CheckMode : uint8_t {
29572982
kStrict = 0, // Parsing will fail if non UTF-8 data is in string fields.

0 commit comments

Comments
 (0)