D8 Shell

Print Optimization Stats

Xét đoạn script sau:

function test( obj ) {
  return obj.prop + obj.prop;
}
 
var a = { prop: 'a' }, i = 0;
 
while (i++ < 10000) {
  test(a);
}

Sử dụng option --trace-opt-verbose để in các thông số tối ưu trong quá trình chạy:

$ ./d8 --trace-opt-verbose optimized.js
[delaying optimization of test, IC changed]
[marking 0x07e8000da919 <JSFunction test (sfi = 0x7e8000da845)> for optimization to MAGLEV, ConcurrencyMode::kConcurrent, reason: hot and stable]
[compiling method 0x07e8000da919 <JSFunction test (sfi = 0x7e8000da845)> (target MAGLEV), mode: ConcurrencyMode::kConcurrent]
[completed compiling 0x07e8000da919 <JSFunction test (sfi = 0x7e8000da845)> (target MAGLEV) - took 0.000, 2.446, 0.089 ms]
[marking 0x07e8000da8e1 <JSFunction (sfi = 0x7e8000da7dd)> for optimization to MAGLEV, ConcurrencyMode::kConcurrent, reason: hot and stable]
[compiling method 0x07e8000da8e1 <JSFunction (sfi = 0x7e8000da7dd)> (target MAGLEV) OSR, mode: ConcurrencyMode::kConcurrent]
[not marking function  (INTERPRETED_FUNCTION) for optimization: already queued]
...
[completed compiling 0x07e8000da8e1 <JSFunction (sfi = 0x7e8000da7dd)> (target MAGLEV) OSR - took 0.001, 1.936, 0.194 ms]
[compiling method 0x07e8000da8e1 <JSFunction (sfi = 0x7e8000da7dd)> (target TURBOFAN) OSR, mode: ConcurrencyMode::kConcurrent]
[not marking function  (MAGLEV) for optimization: already queued]
...
[completed compiling 0x07e8000da8e1 <JSFunction (sfi = 0x7e8000da7dd)> (target TURBOFAN) OSR - took 0.063, 11.521, 0.237 ms]
[completed optimizing 0x07e8000da8e1 <JSFunction (sfi = 0x7e8000da7dd)> (target TURBOFAN) OSR]

Important

Về bản chất, sau khi hàm test() được gọi một vài lần, V8 sẽ đánh dấu nó là “hot and stable” rồi biên dịch bằng Maglev.

Ngoài test(), còn có một hàm không có tên ở địa chỉ 0x28bc000da7dd được đánh dấu là “hot and stable” và được biên dịch bởi Maglev. Có thể thấy trong output ở trên, hàm ở địa chỉ 0x28bc000da7dd còn được biên dịch bởi TurboFan.

Print Deoptimization Stats

Sử dụng đoạn script sau:

function test( obj ) {
  return obj.prop + obj.prop;
}
 
var a = { prop: 'a' }, b = { prop: [] }, i = 0;
 
while ( i++ < 10000 ) {
  test( i !== 8000 ? a : b );
}

Output:

$ ./d8 --trace-opt-verbose deoptimized.js
[delaying optimization of test, IC changed]
[marking 0x007f000da94d <JSFunction test (sfi = 0x7f000da849)> for optimization to MAGLEV, ConcurrencyMode::kConcurrent, reason: hot and stable]
[compiling method 0x007f000da94d <JSFunction test (sfi = 0x7f000da849)> (target MAGLEV), mode: ConcurrencyMode::kConcurrent]
[marking 0x007f000da915 <JSFunction (sfi = 0x7f000da7dd)> for optimization to MAGLEV, ConcurrencyMode::kConcurrent, reason: hot and stable]
[compiling method 0x007f000da915 <JSFunction (sfi = 0x7f000da7dd)> (target MAGLEV) OSR, mode: ConcurrencyMode::kConcurrent]
[completed compiling 0x007f000da94d <JSFunction test (sfi = 0x7f000da849)> (target MAGLEV) - took 0.001, 2.357, 0.081 ms]
[not marking function  (INTERPRETED_FUNCTION) for optimization: already queued]
...
[not marking function  (BASELINE) for optimization: already queued]
...
[completed compiling 0x007f000da915 <JSFunction (sfi = 0x7f000da7dd)> (target MAGLEV) OSR - took 0.001, 23.602, 0.097 ms]
[compiling method 0x007f000da915 <JSFunction (sfi = 0x7f000da7dd)> (target TURBOFAN) OSR, mode: ConcurrencyMode::kConcurrent]
[not marking function  (MAGLEV) for optimization: already queued]
...
[marking 0x007f000da94d <JSFunction test (sfi = 0x7f000da849)> for optimization to MAGLEV, ConcurrencyMode::kConcurrent, reason: hot and stable]
[compiling method 0x007f000da94d <JSFunction test (sfi = 0x7f000da849)> (target MAGLEV), mode: ConcurrencyMode::kConcurrent]
[not marking function  (BASELINE) for optimization: already queued]
[completed compiling 0x007f000da915 <JSFunction (sfi = 0x7f000da7dd)> (target TURBOFAN) OSR - took 0.093, 17.264, 0.240 ms]
[completed optimizing 0x007f000da915 <JSFunction (sfi = 0x7f000da7dd)> (target TURBOFAN) OSR]
[completed compiling 0x007f000da94d <JSFunction test (sfi = 0x7f000da849)> (target MAGLEV) - took 0.001, 0.565, 0.030 ms]

Có thể thấy, hàm test() được đánh dấu và biên dịch lại 2 lần bởi Maglev:

Lần thứ 1: khi V8 thấy rằng đối số truyền vào hàm test() luôn có dạng {prop: <String>}. Đến vòng lặp thứ 8001, shape của đối số (hidden class) bị thay đổi và điều này dẫn đến quá trình deoptimization.
Lần thứ 2: khi V8 lại quan sát thấy đối số truyền vào hàm test() có hidden class cố định từ vòng lặp 8002 đến vòng lặp 10000.

Profiling

Xét đoạn script sau:

function factorial( n ) {
  return n === 1 ? n : n * factorial( --n );
}
 
var i = 0;
 
while ( i++ < 1e7 ) {
  factorial( 10 );
}

Dùng lệnh time:

$ time ./d8 --prof profiling.js
./d8 --prof profiling.js  2.47s user 0.34s system 101% cpu 2.776 total

Với 2.47s là thời gian sử dụng CPU ở user mode còn 0.34s là thời gian sử dụng CPU ở kernel mode.

Option --prof sẽ sinh ra file v8.log. Để tìm ra hàm sử dụng nhiều thời gian nhất, có thể dùng công cụ v8/tools/linux-tick-processor:

$ ./v8/v8/tools/linux-tick-processor ~/cves/v8-out-of-bound/v8.log

Output có đoạn như sau:

 [JavaScript]:
   ticks  total  nonlib   name
   1521   84.1%   84.7%  JS: *factorial profiling.js:1:19
    151    8.3%    8.4%  JS: *<anonymous> profiling.js:1:1

Có thể thấy, hàm factorial chiếm nhiều thời gian nhất. Ký tự * ở trước tên hàm cho biết nó đã được tối ưu.

Sửa script lại như sau:

function factorial( n ) {
  return equal( n, 1 ) ? n : multiply( n, factorial( --n ) );
}
 
function multiply( x, y ) {
  return x * y;
}
 
function equal( a, b ) {
  return a === b;
}
 
var i = 0;
 
while ( i++ < 1e7 ) {
  factorial( 10 );
}

Chạy time thì thấy thời gian thực thi không xê dịch gì nhiều:

$ time ./d8 --prof profiling-modified.js
./d8 --prof profiling-modified.js  2.10s user 0.58s system 82% cpu 3.245 total

Output khi phân tích file v8.log:

 [JavaScript]:
   ticks  total  nonlib   name
   1708   83.8%   84.6%  JS: *factorial profiling-modified.js:1:19
    138    6.8%    6.8%  JS: *<anonymous> profiling-modified.js:1:1

Ta nhận thấy rằng hàm equal() và multiply() không tồn tại ở trong file log.

Kiểm tra turbo inlining bằng option --trace-turbo-inlining:

$ ./d8 --trace-turbo-inlining profiling-modified.js

Ta thấy các dòng sau:

Considering 0x55e230c06d20 {0x114f000da8f9 <SharedFunctionInfo equal>} for inlining with 0x55e230c07320 {0x114f000dab4d <FeedbackVector[1]>}
Inlining small function(s) at call site #22:JSCall
Inlining 0x55e230c06d20 {0x114f000da8f9 <SharedFunctionInfo equal>} into 0x55e230c06be8 {0x114f000da879 <SharedFunctionInfo factorial>}
 
Considering 0x55e230c070e0 {0x114f000da8b9 <SharedFunctionInfo multiply>} for inlining with 0x55e230c079c8 {0x114f000dac91 <FeedbackVector[1]>}
Inlining small function(s) at call site #50:JSCall
Inlining 0x55e230c070e0 {0x114f000da8b9 <SharedFunctionInfo multiply>} into 0x55e230c06be8 {0x114f000da879 <SharedFunctionInfo factorial>}

Important

Có thể thấy, compiler không gọi đến hàm equal() và multiply() mà thay vào đó là gom chúng với hàm factorial().

Cụ thể hơn, đoạn machine code tối ưu cho hai đoạn script trên về cơ bản là tương tự nhau. Ta có thể kiểm chứng điều này bằng option --print-opt-code.

So sánh giữa machine code của profiling.js và profiling-modified.js: profiling.diff - Diffchecker

Tracing Garbage Collection

Xét script sau:

function strToArray( str ) {
  var i = 0,
    len = str.length,
    arr = new Uint16Array( str.length );
  for ( ; i < len; ++i ) {
    arr[ i ] = str.charCodeAt( i );
  }
  return arr;
}
 
var i = 0, str = 'V8 is the collest';
 
while ( i++ < 1e5 ) {
  strToArray( str );
}

Dùng option --trace-gc để in ra thông tin của garbage collector khi chạy script:

[8978:0x5567f18e5000]      222 ms: Scavenge 1.1 (2.0) -> 0.2 (3.0) MB, 7.11 / 0.00 ms  (average mu = 1.000, current mu = 1.000) allocation failure;
[8978:0x5567f18e5000]      246 ms: Scavenge 1.1 (3.0) -> 0.2 (3.0) MB, 1.08 / 0.00 ms  (average mu = 1.000, current mu = 1.000) allocation failure;
[8978:0x5567f18e5000]      264 ms: Scavenge 1.1 (3.0) -> 0.2 (3.0) MB, 0.36 / 0.00 ms  (average mu = 1.000, current mu = 1.000) allocation failure;
...

Đa số các object được cấp phát vào vùng young generation trên heap vì tốn ít chi phí. Khi vùng nhớ này đầy, garbage collector sẽ tiến hành thu gom rác (Scavenge).

Important

Việc thu gom rác sẽ chỉ diễn ra nếu cần cấp phát thêm vùng nhớ. Nói cách khác, nếu vùng young generation không bao giờ đầy, quá trình thu gom rác sẽ không xảy ra.

Sửa lại đoạn script trên như sau:

function strToArray( str, bufferView ) {
  var i = 0,
    len = str.length;
  for ( ; i < len; ++i ) {
    bufferView[ i ] = str.charCodeAt( i );
  }
  return bufferView;
}
 
var i = 0,
  str = 'V8 is the coolest',
  buffer = new ArrayBuffer( str.length * 2 ),
  bufferView = new Uint16Array( buffer );
 
while ( i++ < 1e5 ) {
  strToArray( str, bufferView );
}

Đoạn script trên cấp phát sẵn vùng nhớ ArrayBuffer và dùng lại vùng nhớ Uint16Array mỗi lần chạy hàm strToArray() nhằm giảm thiểu việc cấp phát thêm vùng nhớ mới.

Khi chạy d8 với option --trace-gc thì không có thông tin Scavenge nào được hiển thị.

Chỉnh sửa đoạn script thêm một lần nữa:

function strToArray( str ) {
  var i = 0,
    len = str.length,
    arr = new Uint16Array( str.length );
  for ( ; i < len; ++i ) {
    arr[ i ] = str.charCodeAt( i );
  }
  return arr;
}
 
var i = 0, str = 'V8 is the coolest', arr = [];
 
while ( i++ < 1e6 ) {
  strToArray( str );
  if ( i % 100000 === 0 ) {
    // save a long-term reference to a random, huge object
    arr.push( new Uint16Array( 100000000 ) );
    // release references about 5% of the time
    Math.random() > 0.95 && ( arr.length = 0 );
  }
}

Đoạn script trên ngoài cấp phát vùng nhớ mỗi lần gọi hàm strToArray() thì còn lưu tham chiếu đến một object lớn rồi giải phóng nó trong khoảng 5% thời gian thực thi của script.

Có thể thấy, log của garbage collector ngoài các dòng Scavenge thì còn có các dòng Mark-Compact. Các dòng này được chạy khi vùng old generation đạt một kích thước nhất định nào đó. Mark-Compact thường có thời gian thực thi lâu hơn Scavenge.

Some Useful Options

Một số option hữu ích:

--print-opt-code: in ra các đoạn code đã được tối ưu hóa.
--print-bytecode: in ra bytecode được sinh ra bởi intepreter.
--trace-ic: in ra các loại object có trong inline cache.
--trace-deopt: theo dõi các hàm bị deoptimized.

list
from outgoing([[D8 Shell]])
sort file.ctime asc

Quartz 🪬

Explorer

D8 Shell

Print Optimization Stats

Print Deoptimization Stats

Profiling

Tracing Garbage Collection

Some Useful Options

Resources

Table of Contents

Backlinks

Quartz 🪬

Explorer

D8 Shell

Print Optimization Stats

Print Deoptimization Stats

Profiling

Tracing Garbage Collection

Some Useful Options

Related

Resources

Table of Contents

Backlinks