diff mbox

RFR: Add support for G1GC

Message ID 1395139945.16914.4.camel@localhost.localdomain
State New
Headers show

Commit Message

Edward Nevill March 18, 2014, 10:52 a.m. UTC
Hi,

The following patch adds support for G1GC.

This is disabled by default and only enabled with the -XX:+UseG1GC option.

I have tested this against JTreg hotspot and it gives no additional failures/errors.

OK to push?
Ed.

--- CUT HERE ---
exporting patch:
# HG changeset patch
# User Edward Nevill edward.nevill@linaro.org
# Date 1395139429 0
#      Tue Mar 18 10:43:49 2014 +0000
# Node ID 53205a277e07e8be32c4592ba0982f7bc3817717
# Parent  939480aaf1b23f1013de7bca05dd6a2c3cef3430
Add support for G1GC

Comments

Andrew Haley March 18, 2014, 11:12 a.m. UTC | #1
On 03/18/2014 10:52 AM, Edward Nevill wrote:
> Hi,
> 
> The following patch adds support for G1GC.
> 
> This is disabled by default and only enabled with the -XX:+UseG1GC option.
> 
> I have tested this against JTreg hotspot and it gives no additional failures/errors.
> 
> OK to push?

One or two questions inline:

> --- CUT HERE ---
> exporting patch:
> # HG changeset patch
> # User Edward Nevill edward.nevill@linaro.org
> # Date 1395139429 0
> #      Tue Mar 18 10:43:49 2014 +0000
> # Node ID 53205a277e07e8be32c4592ba0982f7bc3817717
> # Parent  939480aaf1b23f1013de7bca05dd6a2c3cef3430
> Add support for G1GC
> 
> diff -r 939480aaf1b2 -r 53205a277e07 src/cpu/aarch64/vm/aarch64.ad
> --- a/src/cpu/aarch64/vm/aarch64.ad	Tue Mar 11 15:44:21 2014 +0000
> +++ b/src/cpu/aarch64/vm/aarch64.ad	Tue Mar 18 10:43:49 2014 +0000
> @@ -5112,6 +5112,19 @@
>  
>  // Store Instructions
>  
> +// Store CMS card-mark Immediate
> +instruct storeimmCM0(immI0 zero, memory mem)
> +%{
> +  match(Set mem (StoreCM mem zero));
> +
> +  ins_cost(MEMORY_REF_COST);
> +  format %{ "strb zr, $mem\t# byte" %}
> +
> +  ins_encode(aarch64_enc_strb0(mem));

Are you sure this isn't a store release?  I'm not sure.

> @@ -1866,47 +1866,47 @@
>  void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) {
>    
>    assert(left->is_single_cpu() || left->is_double_cpu(), "expect single or double register");
> -  if (left->is_single_cpu()) {
> -    assert (right->is_single_cpu() || right->is_constant(), "single register or constant expected");
> -    if (right->is_constant()
> -	&& Assembler::operand_valid_for_logical_immediate(true, right->as_jint())) {
> -
> -      switch (code) {
> -      case lir_logic_and: __ andw (dst->as_register(), left->as_register(), right->as_jint()); break;
> -      case lir_logic_or:  __ orrw (dst->as_register(), left->as_register(), right->as_jint()); break;
> -      case lir_logic_xor: __ eorw (dst->as_register(), left->as_register(), right->as_jint()); break;
> -      default: ShouldNotReachHere(); break;
> -      }
> -    } else {
> -      switch (code) {
> -      case lir_logic_and: __ andw (dst->as_register(), left->as_register(), right->as_register()); break;
> -      case lir_logic_or:  __ orrw (dst->as_register(), left->as_register(), right->as_register()); break;
> -      case lir_logic_xor: __ eorw (dst->as_register(), left->as_register(), right->as_register()); break;
> -      default: ShouldNotReachHere(); break;
> -      }
> -    }
> -  } else {
> -    assert (right->is_double_cpu() || right->is_constant(), "single register or constant expected");
> -    if (right->is_double_cpu()) {
> -      switch (code) {
> -      case lir_logic_and: __ andr(dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
> -      case lir_logic_or:  __ orr (dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
> -      case lir_logic_xor: __ eor (dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
> -      default:
> -	ShouldNotReachHere();
> -	break;
> -      }
> -    } else {
> -      switch (code) {
> -      case lir_logic_and: __ andr(dst->as_register_lo(), left->as_register_lo(), right->as_jlong()); break;
> -      case lir_logic_or:  __ orr (dst->as_register_lo(), left->as_register_lo(), right->as_jlong()); break;
> -      case lir_logic_xor: __ eor (dst->as_register_lo(), left->as_register_lo(), right->as_jlong()); break;
> -      default:
> -	ShouldNotReachHere();
> -	break;
> -      }
> -    }
> -  }
> +  Register Rleft = left->is_single_cpu() ? left->as_register() :
> +                                           left->as_register_lo();
> +   if (dst->is_single_cpu()) {
> +     Register Rdst = dst->as_register();
> +     if (right->is_constant()) {
> +       switch (code) {
> +         case lir_logic_and: __ andw (Rdst, Rleft, right->as_jint()); break;
> +         case lir_logic_or:  __ orrw (Rdst, Rleft, right->as_jint()); break;
> +         case lir_logic_xor: __ eorw (Rdst, Rleft, right->as_jint()); break;
> +         default: ShouldNotReachHere(); break;
> +       }
> +     } else {
> +       Register Rright = right->is_single_cpu() ? right->as_register() :
> +                                                  right->as_register_lo();
> +       switch (code) {
> +         case lir_logic_and: __ andw (Rdst, Rleft, Rright); break;
> +         case lir_logic_or:  __ orrw (Rdst, Rleft, Rright); break;
> +         case lir_logic_xor: __ eorw (Rdst, Rleft, Rright); break;
> +         default: ShouldNotReachHere(); break;
> +       }
> +     }
> +   } else {
> +     Register Rdst = dst->as_register_lo();
> +     if (right->is_constant()) {
> +       switch (code) {
> +         case lir_logic_and: __ andr (Rdst, Rleft, right->as_jlong()); break;
> +         case lir_logic_or:  __ orr (Rdst, Rleft, right->as_jlong()); break;
> +         case lir_logic_xor: __ eor (Rdst, Rleft, right->as_jlong()); break;
> +         default: ShouldNotReachHere(); break;
> +       }
> +     } else {
> +       Register Rright = right->is_single_cpu() ? right->as_register() :
> +                                                  right->as_register_lo();
> +       switch (code) {
> +         case lir_logic_and: __ andr (Rdst, Rleft, Rright); break;
> +         case lir_logic_or:  __ orr (Rdst, Rleft, Rright); break;
> +         case lir_logic_xor: __ eor (Rdst, Rleft, Rright); break;
> +         default: ShouldNotReachHere(); break;
> +       }
> +     }
> +   }
>  }

What is this rewrite of logic_op for?  Is it part of this patch?

> +	//__ push(r0->bit(1) | r1->bit(1), sp);
> +	__ push(r0->bit(1) | r1->bit(1) | rscratch1->bit(1) | rscratch2->bit(1), sp);

What is the commented-out code for?  Why is this particular set of registers
pushed?

> +  // Calling the runtime using the regular call_VM_leaf mechanism generates
> +  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
> +  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.

Does it really?

Otherwise this is OK.

Andrew.
Edward Nevill March 21, 2014, 1:59 p.m. UTC | #2
On Tue, 2014-03-18 at 11:12 +0000, Andrew Haley wrote:
> On 03/18/2014 10:52 AM, Edward Nevill wrote:
> > +// Store CMS card-mark Immediate
> > +instruct storeimmCM0(immI0 zero, memory mem)
> > +%{
> > +  match(Set mem (StoreCM mem zero));
> > +
> > +  ins_cost(MEMORY_REF_COST);
> > +  format %{ "strb zr, $mem\t# byte" %}
> > +
> > +  ins_encode(aarch64_enc_strb0(mem));
> 
> Are you sure this isn't a store release?  I'm not sure.

I don't believe a store release is necessary.

It does a full mem barrier in g1_write_barrier_post.

See jdk8/hotspot/src/share/vm/opto/graphKit.cpp

   4014           // Use Op_MemBarVolatile to achieve the effect of a StoreLoad barrier.
   4015           insert_mem_bar(Op_MemBarVolatile, oop_store);
   4016           __ sync_kit(this);
   4017 
   4018           Node* card_val_reload = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw);
   4019           __ if_then(card_val_reload, BoolTest::ne, dirty_card); {
   4020             g1_mark_card(ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf);

This is also in line with what is done for C1 and for the template interpreter.

> 
> > @@ -1866,47 +1866,47 @@
> >  void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) {

> What is this rewrite of logic_op for?  Is it part of this patch?

OK. So I found that LIR was being passed to logic_op with a mixture of single and double CPU registers.

This was from LIRGenerator::G1SATBCardTableModRef_post_barrier in src/share/vm/c1/c1_LIRGenerator.cpp.

I started hacking G1SATBCardTableModRef_post_barrier but there were more cases and the hack started to grow.

Rather than put a growing AARCH64 specific hack in shared code I decided that the most expedient thing to do would be to make logic_op accept the mix. This is what happens on x86 in any case.

I predicated this on the size of dst. I.e. if dst is 32 bit then the whole expression must be 32 bit, otherwise if the dst is 64 bit the operands are treated as 64 bit.

> 
> > +	//__ push(r0->bit(1) | r1->bit(1), sp);
> > +	__ push(r0->bit(1) | r1->bit(1) | rscratch1->bit(1) | rscratch2->bit(1), sp);
> 
> What is the commented-out code for?  Why is this particular set of registers
> pushed?

It should be push(r0, r1), the push of rscratch1, rscratch2 is unnecessary.

r0 needs to be saved, I also push r1 because it is free and because I need to save r0..r7 around the call to g1_wb_post later and by saving r1 here I only have to save r2..r7 later.

> 
> > +  // Calling the runtime using the regular call_VM_leaf mechanism generates
> > +  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
> > +  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.

It seems to check rfp + frame::interpreter_frame_last_sp.

Thanks for the review!

Ed.
Andrew Haley March 21, 2014, 3:44 p.m. UTC | #3
On 03/21/2014 01:59 PM, Edward Nevill wrote:
>> What is the commented-out code for?  Why is this particular set of registers
>> > pushed?
> It should be push(r0, r1), the push of rscratch1, rscratch2 is
> unnecessary.
> 
> r0 needs to be saved, I also push r1 because it is free and because
> I need to save r0..r7 around the call to g1_wb_post later and by
> saving r1 here I only have to save r2..r7 later.

But why do you not need to save all registers?  This is in the middle
of C1-generated code, isn't it?  So anything may be live.  And the
native code you're calling may trash these registers.  I think you need
everything up to r18, excluding rscratch1 and rscratch2.

Andrew.
Andrew Haley March 24, 2014, 6:33 p.m. UTC | #4
On 03/24/2014 04:48 PM, Edward Nevill wrote:
> I have made the following changes.
> 
> - Use adrp(... cardtable, offset) instead of mov.
> 
> In order for this to work I had to add a new relocation to pd_patch_instruction to patch the sequence
> 
> 	adrp	Rx, cardtable, offset
> 	add	Ry, Ry, Rx
> 	ldrb	Rz, Address(Ry, offset)
> 
> - Save R18 and rework the register usage
> 
> I have reworked the register usage in C1 so we do not need to save R0/R1 in the common case and only need to save registers at all around a call to g1_wb_post or g1_wb_pre.
> 
> - Utilise the fact that dirty_card_val() == 0
> 
> Optimise by using cmpzw and cbz where appropriate in both c1 and template interpreter
> 
> assert added to check that dirty_card_val() is in fact 0.
> 
> - Merged several instruction pairs to use 1 instruction instead of 2.
> 
> Revised patch below.

OK, thanks.

Andrew.
diff mbox

Patch

diff -r 939480aaf1b2 -r 53205a277e07 src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Mar 11 15:44:21 2014 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Tue Mar 18 10:43:49 2014 +0000
@@ -5112,6 +5112,19 @@ 
 
 // Store Instructions
 
+// Store CMS card-mark Immediate
+instruct storeimmCM0(immI0 zero, memory mem)
+%{
+  match(Set mem (StoreCM mem zero));
+
+  ins_cost(MEMORY_REF_COST);
+  format %{ "strb zr, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_strb0(mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
 // Store Byte
 instruct storeB(iRegI src, memory mem)
 %{
@@ -5126,6 +5139,7 @@ 
   ins_pipe(pipe_class_memory);
 %}
 
+
 instruct storeimmB0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreB mem zero));
diff -r 939480aaf1b2 -r 53205a277e07 src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp	Tue Mar 11 15:44:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp	Tue Mar 18 10:43:49 2014 +0000
@@ -542,14 +542,46 @@ 
 /////////////////////////////////////////////////////////////////////////////
 #if INCLUDE_ALL_GCS
 
-void G1PreBarrierStub::emit_code(LIR_Assembler* ce) { Unimplemented(); }
+void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
+  // At this point we know that marking is in progress.
+  // If do_load() is true then we have to emit the
+  // load of the previous value; otherwise it has already
+  // been loaded into _pre_val.
+
+  __ bind(_entry);
+  assert(pre_val()->is_register(), "Precondition.");
+
+  Register pre_val_reg = pre_val()->as_register();
+
+  if (do_load()) {
+    ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false /*wide*/, false /*unaligned*/);
+  }
+  __ cbz(pre_val_reg, _continuation);
+  ce->store_parameter(pre_val()->as_register(), 0);
+  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id)));
+  __ b(_continuation);
+}
 
 jbyte* G1PostBarrierStub::_byte_map_base = NULL;
 
-jbyte* G1PostBarrierStub::byte_map_base_slow() { Unimplemented(); return 0; }
+jbyte* G1PostBarrierStub::byte_map_base_slow() {
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->is_a(BarrierSet::G1SATBCTLogging),
+         "Must be if we're using this.");
+  return ((G1SATBCardTableModRefBS*)bs)->byte_map_base;
+}
 
 
-void G1PostBarrierStub::emit_code(LIR_Assembler* ce) { Unimplemented(); }
+void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
+  __ bind(_entry);
+  assert(addr()->is_register(), "Precondition.");
+  assert(new_val()->is_register(), "Precondition.");
+  Register new_val_reg = new_val()->as_register();
+  __ cbz(new_val_reg, _continuation);
+  ce->store_parameter(addr()->as_pointer_register(), 0);
+  __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_post_barrier_slow_id)));
+  __ b(_continuation);
+}
 
 #endif // INCLUDE_ALL_GCS
 /////////////////////////////////////////////////////////////////////////////
diff -r 939480aaf1b2 -r 53205a277e07 src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Tue Mar 11 15:44:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Tue Mar 18 10:43:49 2014 +0000
@@ -1866,47 +1866,47 @@ 
 void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) {
   
   assert(left->is_single_cpu() || left->is_double_cpu(), "expect single or double register");
-  if (left->is_single_cpu()) {
-    assert (right->is_single_cpu() || right->is_constant(), "single register or constant expected");
-    if (right->is_constant()
-	&& Assembler::operand_valid_for_logical_immediate(true, right->as_jint())) {
-
-      switch (code) {
-      case lir_logic_and: __ andw (dst->as_register(), left->as_register(), right->as_jint()); break;
-      case lir_logic_or:  __ orrw (dst->as_register(), left->as_register(), right->as_jint()); break;
-      case lir_logic_xor: __ eorw (dst->as_register(), left->as_register(), right->as_jint()); break;
-      default: ShouldNotReachHere(); break;
-      }
-    } else {
-      switch (code) {
-      case lir_logic_and: __ andw (dst->as_register(), left->as_register(), right->as_register()); break;
-      case lir_logic_or:  __ orrw (dst->as_register(), left->as_register(), right->as_register()); break;
-      case lir_logic_xor: __ eorw (dst->as_register(), left->as_register(), right->as_register()); break;
-      default: ShouldNotReachHere(); break;
-      }
-    }
-  } else {
-    assert (right->is_double_cpu() || right->is_constant(), "single register or constant expected");
-    if (right->is_double_cpu()) {
-      switch (code) {
-      case lir_logic_and: __ andr(dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
-      case lir_logic_or:  __ orr (dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
-      case lir_logic_xor: __ eor (dst->as_register_lo(), left->as_register_lo(), right->as_register_lo()); break;
-      default:
-	ShouldNotReachHere();
-	break;
-      }
-    } else {
-      switch (code) {
-      case lir_logic_and: __ andr(dst->as_register_lo(), left->as_register_lo(), right->as_jlong()); break;
-      case lir_logic_or:  __ orr (dst->as_register_lo(), left->as_register_lo(), right->as_jlong()); break;
-      case lir_logic_xor: __ eor (dst->as_register_lo(), left->as_register_lo(), right->as_jlong()); break;
-      default:
-	ShouldNotReachHere();
-	break;
-      }
-    }
-  }
+  Register Rleft = left->is_single_cpu() ? left->as_register() :
+                                           left->as_register_lo();
+   if (dst->is_single_cpu()) {
+     Register Rdst = dst->as_register();
+     if (right->is_constant()) {
+       switch (code) {
+         case lir_logic_and: __ andw (Rdst, Rleft, right->as_jint()); break;
+         case lir_logic_or:  __ orrw (Rdst, Rleft, right->as_jint()); break;
+         case lir_logic_xor: __ eorw (Rdst, Rleft, right->as_jint()); break;
+         default: ShouldNotReachHere(); break;
+       }
+     } else {
+       Register Rright = right->is_single_cpu() ? right->as_register() :
+                                                  right->as_register_lo();
+       switch (code) {
+         case lir_logic_and: __ andw (Rdst, Rleft, Rright); break;
+         case lir_logic_or:  __ orrw (Rdst, Rleft, Rright); break;
+         case lir_logic_xor: __ eorw (Rdst, Rleft, Rright); break;
+         default: ShouldNotReachHere(); break;
+       }
+     }
+   } else {
+     Register Rdst = dst->as_register_lo();
+     if (right->is_constant()) {
+       switch (code) {
+         case lir_logic_and: __ andr (Rdst, Rleft, right->as_jlong()); break;
+         case lir_logic_or:  __ orr (Rdst, Rleft, right->as_jlong()); break;
+         case lir_logic_xor: __ eor (Rdst, Rleft, right->as_jlong()); break;
+         default: ShouldNotReachHere(); break;
+       }
+     } else {
+       Register Rright = right->is_single_cpu() ? right->as_register() :
+                                                  right->as_register_lo();
+       switch (code) {
+         case lir_logic_and: __ andr (Rdst, Rleft, Rright); break;
+         case lir_logic_or:  __ orr (Rdst, Rleft, Rright); break;
+         case lir_logic_xor: __ eor (Rdst, Rleft, Rright); break;
+         default: ShouldNotReachHere(); break;
+       }
+     }
+   }
 }
 
 
diff -r 939480aaf1b2 -r 53205a277e07 src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp	Tue Mar 11 15:44:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp	Tue Mar 18 10:43:49 2014 +0000
@@ -42,6 +42,9 @@ 
 #include "runtime/vframe.hpp"
 #include "runtime/vframeArray.hpp"
 #include "vmreg_aarch64.inline.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#endif
 
 
 // Implementation of StubAssembler
@@ -1148,6 +1151,133 @@ 
       }
       break;
 
+#if INCLUDE_ALL_GCS
+    case g1_pre_barrier_slow_id:
+      {
+        StubFrame f(sasm, "g1_pre_barrier", dont_gc_arguments);
+        // arg0 : previous value of memory
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        if (bs->kind() != BarrierSet::G1SATBCTLogging) {
+	  __ mov(r0, (int)id);
+	  __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), r0);
+	  __ should_not_reach_here();
+          break;
+        }
+
+        const Register pre_val = r0;
+        const Register thread = rthread;
+        const Register tmp = rscratch1;
+
+        Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                             PtrQueue::byte_offset_of_active()));
+
+        Address queue_index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                             PtrQueue::byte_offset_of_index()));
+        Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                        PtrQueue::byte_offset_of_buf()));
+
+        Label done;
+        Label runtime;
+
+	//__ push(r0->bit(1) | r1->bit(1), sp);
+	__ push(r0->bit(1) | r1->bit(1) | rscratch1->bit(1) | rscratch2->bit(1), sp);
+        // Can we store original value in the thread's buffer?
+        f.load_argument(0, pre_val);
+        __ ldr(tmp, queue_index);
+        __ cbz(tmp, runtime);
+
+        __ sub(tmp, tmp, wordSize);
+        __ str(tmp, queue_index);
+        __ ldr(rscratch2, buffer);
+        __ add(tmp, tmp, rscratch2);
+        __ str(pre_val, Address(tmp, 0));
+        __ b(done);
+
+        __ bind(runtime);
+        __ push(0xfc, sp);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
+	__ pop(0xfc, sp);
+        __ bind(done);
+	//__ pop(r0->bit(1) | r1->bit(1), sp);
+	__ pop(r0->bit(1) | r1->bit(1) | rscratch1->bit(1) | rscratch2->bit(1), sp);
+      }
+      break;
+    case g1_post_barrier_slow_id:
+      {
+        StubFrame f(sasm, "g1_post_barrier", dont_gc_arguments);
+
+        // arg0: store_address
+        Address store_addr(rfp, 2*BytesPerWord);
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+        Label done;
+        Label runtime;
+
+        // At this point we know new_value is non-NULL and the new_value crosses regions.
+        // Must check to see if card is already dirty
+
+        const Register thread = rthread;
+
+        Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                             PtrQueue::byte_offset_of_index()));
+        Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                        PtrQueue::byte_offset_of_buf()));
+
+        const Register card_addr = rscratch2;
+
+	//__ push(r0->bit(1) | r1->bit(1), sp);
+	__ push(r0->bit(1) | r1->bit(1) | rscratch1->bit(1) | rscratch2->bit(1), sp);
+        f.load_argument(0, card_addr);
+        __ lsr(card_addr, card_addr, CardTableModRefBS::card_shift);
+        // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
+        // a valid address and therefore is not properly handled by the relocation code.
+	__ mov(rscratch1, (intptr_t)ct->byte_map_base);
+        __ add(card_addr, card_addr, rscratch1);
+        __ ldrb(rscratch1, Address(card_addr, 0));
+        __ cmpw(rscratch1, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+	__ br(Assembler::EQ, done);
+
+        __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
+        __ ldrb(rscratch1, Address(card_addr, 0));
+        __ cmpw(rscratch1, (int)CardTableModRefBS::dirty_card_val());
+	__ br(Assembler::EQ, done);
+
+        // storing region crossing non-NULL, card is clean.
+        // dirty card and log.
+
+        __ mov(rscratch1, (int)CardTableModRefBS::dirty_card_val());
+        __ strb(rscratch1, Address(card_addr, 0));
+
+        __ ldr(rscratch1, queue_index);
+        __ cbz(rscratch1, runtime);
+        __ sub(rscratch1, rscratch1, wordSize);
+        __ str(rscratch1, queue_index);
+
+        const Register buffer_addr = rscratch2;
+
+	__ push(card_addr->bit(1), sp);
+	__ ldr(buffer_addr, buffer);
+	__ add(rscratch1, buffer_addr, rscratch1);
+	__ pop(card_addr->bit(1), sp);
+	__ str(card_addr, Address(rscratch1, 0));
+	__ b(done);
+
+        __ bind(runtime);
+	__ push(0xfc, sp);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+	__ pop(0xfc, sp);
+        __ bind(done);
+	//__ pop(r0->bit(1) | r1->bit(1), sp);
+	__ pop(r0->bit(1) | r1->bit(1) | rscratch1->bit(1) | rscratch2->bit(1), sp);
+
+      }
+      break;
+#endif
+
     case predicate_failed_trap_id:
       {
         StubFrame f(sasm, "predicate_failed_trap", dont_gc_arguments);
diff -r 939480aaf1b2 -r 53205a277e07 src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Mar 11 15:44:21 2014 +0000
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Mar 18 10:43:49 2014 +0000
@@ -47,11 +47,12 @@ 
 // #include "runtime/os.hpp"
 // #include "runtime/sharedRuntime.hpp"
 // #include "runtime/stubRoutines.hpp"
-// #if INCLUDE_ALL_GCS
-// #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
-// #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
-// #include "gc_implementation/g1/heapRegion.hpp"
-// #endif
+
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc_implementation/g1/heapRegion.hpp"
+#endif
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
@@ -2409,13 +2410,174 @@ 
                                           Register thread,
                                           Register tmp,
                                           bool tosca_live,
-                                          bool expand_call) { Unimplemented(); }
+                                          bool expand_call) {
+  // If expand_call is true then we expand the call_VM_leaf macro
+  // directly to skip generating the check by
+  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
+
+#ifdef _LP64
+  assert(thread == rthread, "must be");
+#endif // _LP64
+
+  Label done;
+  Label runtime;
+
+  assert(pre_val != noreg, "check this code");
+
+  if (obj != noreg)
+    assert_different_registers(obj, pre_val, tmp);
+
+  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_active()));
+  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_index()));
+  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                       PtrQueue::byte_offset_of_buf()));
+
+
+  // Is marking active?
+  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+    ldrw(tmp, in_progress);
+  } else {
+    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
+    ldrb(tmp, in_progress);
+  }
+  cbzw(tmp, done);
+
+  // Do we need to load the previous value?
+  if (obj != noreg) {
+    load_heap_oop(pre_val, Address(obj, 0));
+  }
+
+  // Is the previous value null?
+  cbz(pre_val, done);
+
+  // Can we store original value in the thread's buffer?
+  // Is index == 0?
+  // (The index field is typed as size_t.)
+
+  ldr(tmp, index);                      // tmp := *index_adr
+  cbz(tmp, runtime);                    // tmp == 0?
+                                        // If yes, goto runtime
+
+  sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
+  str(tmp, index);                      // *index_adr := tmp
+  ldr(rscratch1, buffer);
+  add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
+
+  // Record the previous value
+  str(pre_val, Address(tmp, 0));
+  b(done);
+
+  bind(runtime);
+  // save the live input values
+  push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
+
+  // Calling the runtime using the regular call_VM_leaf mechanism generates
+  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
+  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
+  //
+  // If we care generating the pre-barrier without a frame (e.g. in the
+  // intrinsified Reference.get() routine) then ebp might be pointing to
+  // the caller frame and so this check will most likely fail at runtime.
+  //
+  // Expanding the call directly bypasses the generation of the check.
+  // So when we do not have have a full interpreter frame on the stack
+  // expand_call should be passed true.
+
+  if (expand_call) {
+    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
+    pass_arg1(this, thread);
+    pass_arg0(this, pre_val);
+    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
+  } else {
+    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
+  }
+
+  pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
+
+  bind(done);
+}
 
 void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                            Register new_val,
                                            Register thread,
                                            Register tmp,
-                                           Register tmp2) { Unimplemented(); }
+                                           Register tmp2) {
+#ifdef _LP64
+  assert(thread == rthread, "must be");
+#endif // _LP64
+
+  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                       PtrQueue::byte_offset_of_index()));
+  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                       PtrQueue::byte_offset_of_buf()));
+
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+  Label done;
+  Label runtime;
+
+  // Does store cross heap regions?
+
+  mov(tmp, store_addr);
+  eor(tmp, tmp, new_val);
+  lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
+  cbz(tmp, done);
+
+  // crosses regions, storing NULL?
+
+  cbz(new_val, done);
+
+  // storing region crossing non-NULL, is card already dirty?
+
+  ExternalAddress cardtable((address) ct->byte_map_base);
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+  const Register card_addr = tmp;
+
+  mov(card_addr, store_addr);
+  lsr(card_addr, card_addr, CardTableModRefBS::card_shift);
+
+  unsigned long offset;
+  adrp(tmp2, cardtable, offset);
+
+  // get the address of the card
+  add(card_addr, card_addr, tmp2);
+  ldrb(tmp2, Address(card_addr, offset));
+  cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+  br(Assembler::EQ, done);
+
+  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
+  ldrb(tmp2, Address(card_addr, offset));
+  cmpw(tmp2, (int)CardTableModRefBS::dirty_card_val());
+  br(Assembler::EQ, done);
+
+  // storing a region crossing, non-NULL oop, card is clean.
+  // dirty card and log.
+
+  mov(tmp2, (int)CardTableModRefBS::dirty_card_val());
+  strb(tmp2, Address(card_addr, offset));
+
+  ldr(rscratch1, queue_index);
+  cbz(rscratch1, runtime);
+  sub(rscratch1, rscratch1, wordSize);
+  str(rscratch1, queue_index);
+
+  ldr(tmp2, buffer);
+  add(tmp2, tmp2, rscratch1);
+  str(card_addr, Address(tmp2, 0));
+  b(done);
+
+  bind(runtime);
+  // save the live input values
+  push(store_addr->bit(true) | new_val->bit(true), sp);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+  pop(store_addr->bit(true) | new_val->bit(true), sp);
+
+  bind(done);
+}
 
 #endif // INCLUDE_ALL_GCS

--- CUT HERE ---