diff mbox

[v2] aarch64: Add split-stack initial support

Message ID 73b35310-87fe-7a11-241d-eacbcd08c6b0@linaro.org
State New
Headers show

Commit Message

Adhemerval Zanella Nov. 25, 2016, 5:40 p.m. UTC
On 15/11/2016 16:38, Wilco Dijkstra wrote:
> 

> On 07/11/2016 16:59, Adhemerval Zanella wrote:

>> On 14/10/2016 15:59, Wilco Dijkstra wrote:

> 

>> There is no limit afaik on gold split stack allocation handling,

>> and I think one could be added for each backend (in the method

>> override require to implement it).

>>

>> In fact it is not really required to tie the nop generation with the

>> instruction generated by 'aarch64_internal_mov_immediate', it is

>> just a matter to simplify linker code.  

> 

> If there is no easy limit and you'll still require a nop, I think it is best then

> to emit mov N+movk #0. Then the scheduler won't be able to reorder

> them with the add/sub.


Good call, I have changed the patch to emit a mov N+movk #0 instead
of relying on emit_nop.

> 

>>> Is there any need to detect underflow of x10 or is there a guarantee that stacks are

>>> never allocated in the low 2GB (given the maximum adjustment is 2GB)? It's safe

>>> to do a signed comparison.

>>

>> I do not think so, at least none of current backend that implements

>> split stack do so.

> 

> OK, well a signed comparison like in your new version works for underflow.

> 

> Now to the patch:

> 

> 

> @@ -3316,6 +3339,28 @@ aarch64_expand_prologue (void)

>    aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,

>  			     callee_adjust != 0 || frame_pointer_needed);

>    aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);

> +

> +  if (split_stack_arg_pointer_used_p ())

> +    {

> +      /* Setup the argument pointer (x10) for -fsplit-stack code.  If

> +	 __morestack was called, it will left the arg pointer to the

> +	 old stack in x28.  Otherwise, the argument pointer is the top

> +	 of current frame.  */

> +      rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM);

> +      rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);

> +      rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);

> +

> +      rtx not_more = gen_label_rtx ();

> +

> +      rtx cmp = gen_rtx_fmt_ee (LT, VOIDmode, cc_reg, const0_rtx);

> +      rtx jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more));

> +      JUMP_LABEL (jump) = not_more;

> +      LABEL_NUSES (not_more) += 1;

> +

> +      emit_move_insn (x11, x28);

> +

> +      emit_label (not_more);

> +    }

> 

> If you pass the old sp in x11 when called from __morestack you can remove

> the above thunk completely.


Indeed this snippet does not make sense anymore, I removed it.

> 

> +  /* It limits total maximum stack allocation on 2G so its value can be

> +     materialized using two instructions at most (movn/movk).  It might be

> +     used by the linker to add some extra space for split calling non split

> +     stack functions.  */

> +  allocate = cfun->machine->frame.frame_size;

> +  if (allocate > ((HOST_WIDE_INT) 1 << 31))

> +    {

> +      sorry ("Stack frame larger than 2G is not supported for -fsplit-stack");

> +      return;

> +    }

> 

> Note a 2-instruction mov/movk can generate any immediate up to 4GB and if

> we need even large sizes, we could round up to a multiple of 64KB so that 2

> instructions are enough for a 48-bit stack size...


I think we can set a limit of 4GB (powerpc64 backend limits to 2GB and
it seems fine).  I corrected the comment.

> 

> +  int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate),

> +					      true, Pmode);

> +  gcc_assert (ninsn == 1 || ninsn == 2);

> +  if (ninsn == 1)

> +    emit_insn (gen_nop ());

> 

> To avoid any issues with the nop being scheduled, it's best to emit an explicit movk

> here (0xffff0000 if allocate > 0, or 0 if zero) using gen_insv_immdi.


Right, I changed to that.

> 

> +void

> +aarch64_split_stack_space_check (rtx size, rtx label)

> 

> Isn't very similar code used in aarch64_expand_split_stack_prologue? Any possibility

> to share/reuse?


I thought about it, but it would require splitting it into two subparts
(one to load __private_ss from TCB and another to jump to __morestack)
and both are basically 4 lines.  In the end I think the current approach
should be simpler.

> 

> +static void

> +aarch64_live_on_entry (bitmap regs)

> +{

> +  if (flag_split_stack)

> +    bitmap_set_bit (regs, R11_REGNUM);

> +}

> 

> I'm wondering whether you need extra code in aarch64_can_eliminate to deal

> with the argument pointer? Also do we need to define a fixed register, or will GCC

> automatically allocate it to a callee-save if necessary?


Now that you ask, I think we can get rid of this live marking.
I used the powerpc backend as the base for the initial patch and modelled
the argument pointer usage for aarch64 on it.  But after the
patch iterations and the way the register is now set, I think it is safe
to remove this constraint.

> 

> +++ b/libgcc/config/aarch64/morestack.S

> 

> +/* Offset from __morestack frame where the arguments size saved and

> +   passed to __generic_morestack.  */

> +#define ARGS_SIZE_SAVE		80

> 

> This define is unused.


Ack, I removed  it.

> 

> +# The normal function prologue follows here, with a small addition at the

> +# end to set up the argument pointer if required (the prolog):

> +#

> +#       [...]                  # default function prologue

> +#	b.lt   function:

> +#	mov    x11, x28

> 

> We don't need this if we pass sp in x11 when calling back to the original function.


Indeed, I removed it.

> 

> +	stp 	x8, x10, [sp, 80]

> +	stp	x11, x12, [sp, 96]

> 

> No need to save x11 - it just contains original sp.


Ack.

> 

> +	str	x28, [sp, 112]

> +	.cfi_offset 28, -112

> +

> +	# Setup on x28 the function initial frame pointer.

> +	add	x28, sp, MORESTACK_FRAMESIZE

> 

> Why save x28 when x28 = x29 + MORESTACK_FRAMESIZE? You can use x29

> throughout the code as it is preserved by calls.


Indeed, I refactored morestack.S to remove the x28 save/restore/usage
and used x29 instead.

> 

> +	# Start using new stack

> +	str	x29, [x0, -16]!

> 

> This has no use.


Ack, I removed it.

> 

> +	mov	sp, x0

> +

> +	# Set __private_ss stack guard for the new stack.

> +	ldr	x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]

> +	add	x0, x0, BACKOFF

> 

> +	sub	x0, x0, 16

> 

> Neither has this.


Ack.

> 

> +	ldp	x11, x12, [x28, STACKFRAME_BASE + 96]

> +	# Indicate __morestack was called.

> +	cmp	x12, 0

> +	blr	x12

> 

> There is no need to restore x11, all we need to do is restore x12 and branch:


Ack.

> 

> ldr x12, [x28, STACKFRAME_BASE + ...]

> add x11, x29, MORESTACK_FRAMESIZE

> blx x12

> 

> +	# Use old stack again.

> +	#sub	sp, x28, 16

> +	mov	sp, x28

> 

> Use:

> 

> add sp, x29, MORESTACK_FRAMESIZE


Ack.

> 

> +	ldp	x0, x1, [x28, STACKFRAME_BASE + 16]

> +	ldp	x2, x3, [x28, STACKFRAME_BASE + 32]

> +	ldp	x4, x5, [x28, STACKFRAME_BASE + 48]

> +	ldp	x6, x7, [x28, STACKFRAME_BASE + 64]

> +	ldp	x29, x30, [x28, STACKFRAME_BASE]

> +	ldr	x28, [x28, STACKFRAME_BASE + 112]

> +

> +	.cfi_remember_state

> +	.cfi_restore 30

> +	.cfi_restore 29

> +	.cfi_restore 28

> +	.cfi_def_cfa 31, 0

> 

> This needs to restore x29/x30 last to get correct unwinding:

> 

> ldp	x29, x30, [sp], MORESTACK_FRAMESIZE

> 	.cfi_remember_state

>         .cfi_restore 30

> 	.cfi_restore 29

> 	.cfi_def_cfa 31, 0

> 


Ack, I changed to this snippet.

Thanks again for the review; attached is an updated version of
the patch.  I am now just struggling with an elusive regression (pprof_test.go)
and with this one fixed I think the whole patch should be ok.

Comments

Wilco Dijkstra Jan. 3, 2017, 3:13 p.m. UTC | #1
Adhemerval Zanella wrote:
  
Sorry for the late reply - but I think it's getting there. A few more comments:

+  /* If function uses stacked arguments save the old stack value so morestack
+     can return it.  */
+  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);
+  if (cfun->machine->frame.saved_regs_size
+      || cfun->machine->frame.saved_varargs_size)
+    emit_move_insn (reg11, stack_pointer_rtx);

This doesn't look right - we could have many arguments even without varargs or
saved regs.  This would need to check varargs as well as crtl->args.size (I believe
that is the size of the arguments on the stack). It's fine to omit this optimization
in the first version - we already emit 2-3 extra instructions for the check anyway.


+void
+aarch64_split_stack_space_check (rtx size, rtx label)
{
+  rtx mem, ssvalue, cc, cmp, jump, temp;
+  rtx requested = gen_reg_rtx (Pmode);
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);

ssvalue doesn't need to be a hardcoded register.

+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  temp = gen_rtx_REG (Pmode, R10_REGNUM);
+
+  /* And compare it with frame pointer plus required stack.  */
+  size = force_reg (Pmode, size);
+  emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));
+
+  /* Jump to __morestack call if current __private_ss is not suffice.  */
+  cc = aarch64_gen_compare_reg (LT, temp, ssvalue);

This uses X10, but where is it set???

+  cmp = gen_rtx_fmt_ee (GEU, VOIDmode, cc, const0_rtx);
+  jump = emit_jump_insn (gen_condjump (cmp, cc, label));
+  JUMP_LABEL (jump) = label;
+}

So neither X10 nor X12 are set before potentially calling __morestack, so I don't
think it will work. Could this be causing the crash you mentioned?

Wilco
Adhemerval Zanella Jan. 24, 2017, 6:05 p.m. UTC | #2
On 03/01/2017 13:13, Wilco Dijkstra wrote:
> Adhemerval Zanella wrote:

>   

> Sorry for the late reply - but I think it's getting there. A few more comments:


No worries.

> 

> +  /* If function uses stacked arguments save the old stack value so morestack

> +     can return it.  */

> +  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);

> +  if (cfun->machine->frame.saved_regs_size

> +      || cfun->machine->frame.saved_varargs_size)

> +    emit_move_insn (reg11, stack_pointer_rtx);

> 

> This doesn't look right - we could have many arguments even without varargs or

> saved regs.  This would need to check varargs as well as ctrl->args.size (I believe

> that is the size of the arguments on the stack). It's fine to omit this optimization

> in the first version - we already emit 2-3 extra instructions for the check anyway.


I will check for a better solution.

> 

> 

> +void

> +aarch64_split_stack_space_check (rtx size, rtx label)

> {

> +  rtx mem, ssvalue, cc, cmp, jump, temp;

> +  rtx requested = gen_reg_rtx (Pmode);

> +  /* Offset from thread pointer to __private_ss.  */

> +  int psso = 0x10;

> +

> +  /* Load __private_ss from TCB.  */

> +  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);

> 

> ssvalue doesn't need to be a hardcoded register.


Indeed, and it seems that this was not being triggered. I have fixed it in
this version.

> 

> +  emit_insn (gen_aarch64_load_tp_hard (ssvalue));

> +  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));

> +  emit_move_insn (ssvalue, mem);

> +

> +  temp = gen_rtx_REG (Pmode, R10_REGNUM);

> +

> +  /* And compare it with frame pointer plus required stack.  */

> +  size = force_reg (Pmode, size);

> +  emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));

> +

> +  /* Jump to __morestack call if current __private_ss is not suffice.  */

> +  cc = aarch64_gen_compare_reg (LT, temp, ssvalue);

> 

> This uses X10, but where is it set???


I fixed it on this version.

> 

> +  cmp = gen_rtx_fmt_ee (GEU, VOIDmode, cc, const0_rtx);

> +  jump = emit_jump_insn (gen_condjump (cmp, cc, label));

> +  JUMP_LABEL (jump) = label;

> +}

> 

> So neither X10 nor X12 are set before potentially calling __morestack, so I don't

> think it will work. Could this be causing the crash you mentioned?


I do not think so; the issue is with the runtime/pprof libgo test, which fails with

[Switching to LWP 18926]
0x000000000050c358 in runtime.sigtrampgo (sig=sig@entry=27, info=info@entry=0x7fb63d5da0, ctx=ctx@entry=0x7fb63d5e20)
    at ../../../gcc-git/libgo/go/runtime/signal_unix.go:221
221             setg(g.m.gsignal)

Where g.m is null.  Trying to obtain a stack trace, I get:

(gdb) bt
#0  0x000000000050c358 in runtime.sigtrampgo (sig=sig@entry=27, info=info@entry=0x7fb63d5da0, ctx=ctx@entry=0x7fb63d5e20)
    at ../../../gcc-git/libgo/go/runtime/signal_unix.go:221
#1  0x000000000056acb4 in runtime.sigtramp (sig=27, info=0x7fb63d5da0, context=0x7fb63d5e20) at ../../../gcc-git/libgo/runtime/go-signal.c:131
#2  <signal handler called>
#3  pprof_test.cpuHog1 () at pprof_test.go:52
#4  0x000000000040c814 in pprof_test.cpuHogger (f=f@entry=0x57c560 <runtime_pprof_test.cpuHog1$descriptor>, dur=<optimized out>) at pprof_test.go:37
#5  0x000000000040c9f8 in pprof_test.$nested1 (dur=<optimized out>) at pprof_test.go:75
#6  0x000000000040d038 in pprof_test.testCPUProfile (t=t@entry=0x420804e680, need=..., f=f@entry=0x57c600 <runtime_pprof_test.$nested1$descriptor>)
    at pprof_test.go:144
#7  0x000000000040c9a8 in runtime_pprof_test.TestCPUProfile (t=0x420804e680) at pprof_test.go:74
#8  0x0000000000543bec in testing.tRunner (param=<optimized out>, fn=<optimized out>) at ../../../gcc-git/libgo/go/testing/testing.go:656
#9  0x0000000000543c84 in testing.$thunk24 (__go_thunk_parameter=<optimized out>) at ../../../gcc-git/libgo/go/testing/testing.go:693
#10 0x000000000041a7dc in kickoff () at ../../../gcc-git/libgo/runtime/proc.c:258
/build/buildd/gdb-7.9/gdb/dwarf2-frame.c:1732: internal-error: add_cie: Assertion `n < 1 || cie_table->entries[n - 1]->cie_pointer < cie->cie_pointer' failed.
A problem internal to GDB has been detected,
further debugging may prove unreliable.
Quit this debugging session? (y or n)

Which may mean that the morestack.S unwind info is not really correct. It could
also be an issue in gdb (I will check with a newer gdb).


> 

> Wilco

>From 09b51fb706a8b15ecaea4ec4b8a80d0a7903053d Mon Sep 17 00:00:00 2001

From: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Date: Wed, 4 May 2016 21:13:39 +0000
Subject: [PATCH] aarch64: Add split-stack initial support

This patch adds the split-stack support on aarch64 (PR #67877).  As for
other ports this patch should be used along with glibc and gold support.

The support is done similar to other architectures: a __private_ss field is
added on TCB in glibc, a target-specific __morestack implementation and
helper functions are added in libgcc, and compiler support is adjusted
(split-stack prologue, va_start for argument handling).  I also plan to
send the gold support to adjust stack allocation across split-stack
and default code calls.

Current approach is similar to powerpc one: at most 4 GB of stack allocation
is supported so stack adjustments can be done with 2 instructions (a mov
followed by a movk; the movk is always emitted so the linker can edit both).
The morestack call is non-standard, with x10 holding the requested stack
pointer, x11 the argument pointer, and x12 the return continuation address.
Unwinding is handled by a personality routine that knows how to find stack segments.

Split-stack prologue on function entry is as follow (this goes before the
usual function prologue):

function:
	mrs    x9, tpidr_el0
	ldr    x9, [x9, 16]	# load __private_ss from the TCB
	mov    x10, -<required stack allocation>
	movk   x10, 0x0
	add    x10, sp, x10
	mov    x11, sp   	# if function has stacked arguments
	adrp   x12, main_fn_entry
	add    x12, x12, :lo12:main_fn_entry
	cmp    x9, x10
	b.lt   <main_fn_entry>
	b      __morestack
main_fn_entry:
	[function prologue]

Notes:

1. Even if a function does not allocate a stack frame, a split-stack prologue
   is created.  It is to avoid issues with tail call for external symbols
   which might require linker adjustment (libgo/runtime/go-varargs.c).

2. Basic-block reordering (enabled with -O2) will move split-stack TCB ldr
   to after the required stack calculation.

3. Similar to powerpc, when the linker detects a call from split-stack to
   non-split-stack code, it adds 16k (or more) to the value found in "allocate"
   instructions (so non-split-stack code gets a larger stack).  The amount is
   tunable by a linker option.  The edit means aarch64 does not need to
   implement __morestack_non_split, necessary on x86 because insufficient
   space is available there to edit the stack comparison code.  This feature
   is only implemented in the GNU gold linker.

4. AArch64 does not handle >4G stack initially and although it is possible
   to implement it, limiting to 4G allows the allocation to be materialized
   with only 2 instructions (mov + movk), thus simplifying the linker
   adjustments required.  Supporting multiple threads each requiring more
   than 4G of stack is probably not that important, and likely to OOM at
   run time.

5. The TCB support on GLIBC is meant to be included in version 2.25.

6. The continuation address materialized on x12 is done using 'adrp'
   plus add and a static relocation.  Current code uses the
   aarch64_expand_mov_immediate function; a better alternative
   would be 'adr', which could be a future optimization (not implemented
   in this patch).

libgcc/ChangeLog:

	* libgcc/config.host: Use t-stack and t-stack-aarch64 for
	aarch64*-*-linux.
	* libgcc/config/aarch64/morestack-c.c: New file.
	* libgcc/config/aarch64/morestack.S: Likewise.
	* libgcc/config/aarch64/t-stack-aarch64: Likewise.
	* libgcc/generic-morestack.c (__splitstack_find): Add aarch64-specific
	code.

gcc/ChangeLog:

	* common/config/aarch64/aarch64-common.c
	(aarch64_supports_split_stack): New function.
	(TARGET_SUPPORTS_SPLIT_STACK): New macro.
	* gcc/config/aarch64/aarch64-linux.h (TARGET_ASM_FILE_END): Remove
	macro.
	* gcc/config/aarch64/aarch64-protos.h: Add
	aarch64_expand_split_stack_prologue and
	aarch64_split_stack_space_check.
	* gcc/config/aarch64/aarch64.c (aarch64_expand_prologue): Setup the
	argument pointer (x10) for split-stack.
	(aarch64_expand_builtin_va_start): Use internal argument pointer
	instead of virtual_incoming_args_rtx.
	(morestack_ref): New symbol.
	(aarch64_expand_split_stack_prologue): New function.
	(aarch64_file_end): Emit the split-stack note sections.
	(aarch64_internal_arg_pointer): Likewise.
	(aarch64_live_on_entry): Set the argument pointer for split-stack.
	(aarch64_split_stack_space_check): Likewise.
	(TARGET_ASM_FILE_END): New macro.
	(TARGET_EXTRA_LIVE_ON_ENTRY): Likewise.
	(TARGET_INTERNAL_ARG_POINTER): Likewise.
	* gcc/config/aarch64/aarch64.h (aarch64_frame): Add
	split_stack_arg_pointer to setup the argument pointer when using
	split-stack.
	* gcc/config/aarch64/aarch64.md (UNSPEC_STACK_CHECK): New unspec.
	(UNSPECV_SPLIT_STACK_CALL): Likewise.
	(split_stack_prologue): New expand.
	(split_stack_space_check): Likewise.
	(split_stack_cond_call): New expand.
---
 gcc/common/config/aarch64/aarch64-common.c |  16 +-
 gcc/config/aarch64/aarch64-linux.h         |   2 -
 gcc/config/aarch64/aarch64-protos.h        |   2 +
 gcc/config/aarch64/aarch64.c               | 174 +++++++++++++++++++-
 gcc/config/aarch64/aarch64.h               |   3 +
 gcc/config/aarch64/aarch64.md              |  57 +++++++
 libgcc/config.host                         |   1 +
 libgcc/config/aarch64/morestack-c.c        |  95 +++++++++++
 libgcc/config/aarch64/morestack.S          | 254 +++++++++++++++++++++++++++++
 libgcc/config/aarch64/t-stack-aarch64      |   3 +
 libgcc/generic-morestack.c                 |   1 +
 11 files changed, 603 insertions(+), 5 deletions(-)
 create mode 100644 libgcc/config/aarch64/morestack-c.c
 create mode 100644 libgcc/config/aarch64/morestack.S
 create mode 100644 libgcc/config/aarch64/t-stack-aarch64

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index a0b7f48..286f0c6 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -107,6 +107,21 @@ aarch64_handle_option (struct gcc_options *opts,
     }
 }
 
+/* -fsplit-stack uses a TCB field available on glibc-2.25.  GLIBC also
+   exports symbol, __tcb_private_ss, to signal it has the field available
+   on TCB allocation.  This aims to prevent binaries linked against newer
+   GLIBC to run on non-supported ones.  */
+
+static bool
+aarch64_supports_split_stack (bool report ATTRIBUTE_UNUSED,
+			      struct gcc_options *opts ATTRIBUTE_UNUSED)
+{
+  return true;
+}
+
+#undef TARGET_SUPPORTS_SPLIT_STACK
+#define TARGET_SUPPORTS_SPLIT_STACK aarch64_supports_split_stack
+
 struct gcc_targetm_common targetm_common = TARGETM_COMMON_INITIALIZER;
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -340,4 +355,3 @@ aarch64_rewrite_mcpu (int argc, const char **argv)
 }
 
 #undef AARCH64_CPU_NAME_LENGTH
-
diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
index c45fc1d..b8daba4 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -80,8 +80,6 @@
     }						\
   while (0)
 
-#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
-
 /* Uninitialized common symbols in non-PIE executables, even with
    strong definitions in dependent shared libraries, will resolve
    to COPY relocated symbol in the executable.  See PR65780.  */
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f55d4ba..bfb8b51 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -377,6 +377,8 @@ void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
 void aarch64_expand_prologue (void);
+void aarch64_expand_split_stack_prologue (void);
+void aarch64_split_stack_space_check (rtx, rtx);
 void aarch64_expand_vector_init (rtx, rtx);
 void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
 				   const_tree, unsigned);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c3992d8..a993983 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -901,7 +901,12 @@ aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
     output_asm_insn (buffer, operands);
 
-    snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
+    if (GET_CODE (operands[pos_label]) == LABEL_REF)
+      snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label,
+		label_ptr);
+    else
+      snprintf (buffer, sizeof (buffer), "b\t%%%d\n%s:", pos_label,
+		label_ptr);
     operands[pos_label] = dest_label;
     output_asm_insn (buffer, operands);
     return "";
@@ -10116,7 +10121,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
   /* Emit code to initialize STACK, which points to the next varargs stack
      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
      by named arguments.  STACK is 8-byte aligned.  */
-  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
+  t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
   if (cum->aapcs_stack_size > 0)
     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
@@ -14674,6 +14679,165 @@ aarch64_excess_precision (enum excess_precision_type type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* -fsplit-stack support.  */
+
+/* A SYMBOL_REF for __morestack.  */
+static GTY(()) rtx morestack_ref;
+
+/* Emit -fsplit-stack prologue, which goes before the regular function
+   prologue.  */
+
+void
+aarch64_expand_split_stack_prologue (void)
+{
+  rtx ssvalue, mem;
+  rtx reg10, reg11, reg12, cc, cmp, jump;
+  HOST_WIDE_INT allocate;
+  rtx_code_label *ok_label = NULL;
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  gcc_assert (flag_split_stack && reload_completed);
+
+  /* It limits total maximum stack allocation on 4G so its value can be
+     materialized using two instructions at most (movn/movk).  It might be
+     used by the linker to add some extra space for split calling non split
+     stack functions.  */
+  allocate = cfun->machine->frame.frame_size;
+  if (allocate > ((int64_t)1 << 32))
+    {
+      sorry ("Stack frame larger than 4G is not supported for -fsplit-stack");
+      return;
+    }
+
+  if (morestack_ref == NULL_RTX)
+    {
+      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
+					   | SYMBOL_FLAG_FUNCTION);
+    }
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* Always emit two insns to calculate the requested stack, so the linker
+     can edit them when adjusting size for calling non-split-stack code.  */
+  reg10 = gen_rtx_REG (Pmode, R10_REGNUM);
+  int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate),
+					      true, Pmode);
+  gcc_assert (ninsn == 1 || ninsn == 2);
+  if (ninsn == 1)
+    {
+      if (allocate > 0)
+	emit_insn (gen_insv_immdi (reg10, GEN_INT (0), GEN_INT (0xffff0000)));
+      else
+	emit_insn (gen_insv_immdi (reg10, GEN_INT (0), GEN_INT (0x0)));
+    }
+  emit_insn (gen_add3_insn (reg10, stack_pointer_rtx, reg10));
+
+  ok_label = gen_label_rtx ();
+
+  /* If function uses stacked arguments save the old stack value so morestack
+     can return it.  */
+  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);
+  if (cfun->machine->frame.saved_regs_size
+      || cfun->machine->frame.saved_varargs_size)
+    emit_move_insn (reg11, stack_pointer_rtx);
+
+  /* x12 holds the continuation address used to return to function.  */
+  reg12 = gen_rtx_REG (Pmode, R12_REGNUM);
+  aarch64_expand_mov_immediate (reg12, gen_rtx_LABEL_REF (VOIDmode, ok_label));
+
+  /* Jump to __morestack call if current __private_ss is not suffice.  */
+  cc = aarch64_gen_compare_reg (GE, ssvalue, reg10);
+  cmp = gen_rtx_fmt_ee (GE, VOIDmode, cc, const0_rtx);
+  jump = gen_split_stack_cond_call (morestack_ref, cmp, ok_label, reg12);
+
+  aarch64_emit_unlikely_jump (jump);
+  JUMP_LABEL (jump) = ok_label;
+  LABEL_NUSES (ok_label)++;
+
+  /* __morestack will call us here.  */
+  emit_label (ok_label);
+}
+
+/* Implement TARGET_ASM_FILE_END.  */
+
+static void
+aarch64_file_end (void)
+{
+  file_end_indicate_exec_stack ();
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
+}
+
+/* Return the internal arg pointer used for function incoming arguments.  */
+
+static rtx
+aarch64_internal_arg_pointer (void)
+{
+  if (flag_split_stack
+     && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
+         == NULL))
+    {
+      if (cfun->machine->frame.split_stack_arg_pointer == NULL_RTX)
+	{
+	  rtx pat;
+
+	  cfun->machine->frame.split_stack_arg_pointer = gen_reg_rtx (Pmode);
+	  REG_POINTER (cfun->machine->frame.split_stack_arg_pointer) = 1;
+
+	  /* Put the pseudo initialization right after the note at the
+	     beginning of the function.  */
+	  pat = gen_rtx_SET (cfun->machine->frame.split_stack_arg_pointer,
+			     gen_rtx_REG (Pmode, R11_REGNUM));
+	  push_topmost_sequence ();
+	  emit_insn_after (pat, get_insns ());
+	  pop_topmost_sequence ();
+	}
+      return plus_constant (Pmode, cfun->machine->frame.split_stack_arg_pointer,
+			    FIRST_PARM_OFFSET (current_function_decl));
+    }
+  return virtual_incoming_args_rtx;
+}
+
+/* Emit -fsplit-stack dynamic stack allocation space check.  */
+
+void
+aarch64_split_stack_space_check (rtx size, rtx label)
+{
+  rtx mem, ssvalue, cc, cmp, jump, temp;
+  rtx reg10, reg11, reg12;
+  rtx requested = gen_reg_rtx (Pmode);
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_reg_rtx (Pmode);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* And compare it with frame pointer plus required stack.  */
+  reg10 = gen_rtx_REG (Pmode, R10_REGNUM);
+  size = force_reg (Pmode, size);
+  emit_move_insn (reg10, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));
+
+  /* x12 holds the continuation address used to return to function.  */
+  reg12 = gen_rtx_REG (Pmode, R12_REGNUM);
+  aarch64_expand_mov_immediate (reg12, gen_rtx_LABEL_REF (VOIDmode, label));
+
+  /* Jump to __morestack call if current __private_ss is not suffice.  */
+  cc = aarch64_gen_compare_reg (GE, ssvalue, reg10);
+  cmp = gen_rtx_fmt_ee (GE, VOIDmode, cc, const0_rtx);
+  jump = emit_jump_insn (gen_condjump (cmp, cc, label));
+  JUMP_LABEL (jump) = label;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -14746,6 +14910,9 @@ aarch64_run_selftests (void)
 #undef TARGET_ASM_FILE_START
 #define TARGET_ASM_FILE_START aarch64_start_file
 
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END aarch64_file_end
+
 #undef TARGET_ASM_OUTPUT_MI_THUNK
 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
 
@@ -14831,6 +14998,9 @@ aarch64_run_selftests (void)
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
 
+#undef TARGET_INTERNAL_ARG_POINTER
+#define TARGET_INTERNAL_ARG_POINTER aarch64_internal_arg_pointer
+
 #undef TARGET_GIMPLE_FOLD_BUILTIN
 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e4fb96f..510f60a 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -594,6 +594,9 @@ struct GTY (()) aarch64_frame
   unsigned wb_candidate2;
 
   bool laid_out;
+
+  /* Alternative internal arg pointer for -fsplit-stack.  */
+  rtx split_stack_arg_pointer;
 };
 
 typedef struct GTY (()) machine_function
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a693a3b..d08b5fc 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -135,6 +135,7 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_STACK_CHECK
     UNSPEC_RSQRT
     UNSPEC_RSQRTE
     UNSPEC_RSQRTS
@@ -150,6 +151,7 @@
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
     UNSPECV_BLOCKAGE		; Represent a blockage
     UNSPECV_PROBE_STACK_RANGE	; Represent stack range probing.
+    UNSPECV_SPLIT_STACK_CALL    ; Represent a morestack call
   ]
 )
 
@@ -5538,3 +5540,58 @@
 
 ;; ldp/stp peephole patterns
 (include "aarch64-ldpstp.md")
+
+;; Handle -fsplit-stack
+(define_expand "split_stack_prologue"
+  [(const_int 0)]
+  ""
+{
+  aarch64_expand_split_stack_prologue ();
+  DONE;
+})
+
+;; If there are operand 0 bytes available on the stack, jump to
+;; operand 1.
+(define_expand "split_stack_space_check"
+  [(set (match_dup 2) (compare:CC (match_dup 3) (match_dup 2)))
+   (set (pc) (if_then_else
+	      (geu (match_dup 4) (const_int 0))
+	      (label_ref (match_operand 1))
+	      (pc)))]
+  ""
+{
+  aarch64_split_stack_space_check (operands[0], operands[1]);
+  DONE;
+})
+
+;; A __morestack call using branch
+
+(define_expand "split_stack_cond_call"
+  [(match_operand 0 "aarch64_call_insn_operand" "")
+   (match_operand 1 "" "")
+   (match_operand 2 "" "")
+   (match_operand 3 "" "")]
+  ""
+{
+  emit_jump_insn (gen_split_stack_cond_call_di (operands[0], operands[1],
+						operands[2], operands[3]));
+  DONE;
+})
+
+
+(define_insn "split_stack_cond_call_<mode>"
+  [(set (pc)
+        (if_then_else
+          (match_operand 1 "aarch64_comparison_operator" "")
+          (label_ref (match_operand 2 "" ""))
+          (pc)))
+   (set (reg:P 1) (unspec_volatile:P [(match_operand:P 0 "aarch64_call_insn_operand" "")
+                                    (reg:P 1)]
+                                   UNSPECV_SPLIT_STACK_CALL))
+   (use (match_operand:P 3 "register_operand" ""))]
+  ""
+  {
+    return aarch64_gen_far_branch (operands, 0, "Lbcond", "b%M1\\t");
+  }
+  [(set_attr "type" "branch")]
+)
diff --git a/libgcc/config.host b/libgcc/config.host
index 540bfa9..ef2bd84 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -344,6 +344,7 @@ aarch64*-*-linux*)
 	md_unwind_header=aarch64/linux-unwind.h
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+	tmake_file="${tmake_file} t-stack aarch64/t-stack-aarch64"
 	;;
 alpha*-*-linux*)
 	tmake_file="${tmake_file} alpha/t-alpha alpha/t-ieee t-crtfm alpha/t-linux"
diff --git a/libgcc/config/aarch64/morestack-c.c b/libgcc/config/aarch64/morestack-c.c
new file mode 100644
index 0000000..8df7895
--- /dev/null
+++ b/libgcc/config/aarch64/morestack-c.c
@@ -0,0 +1,95 @@
+/* AArch64 support for -fsplit-stack.
+ * Copyright (C) 2016 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef inhibit_libc
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include "generic-morestack.h"
+
+/* This is based on the GLIBC definition (version 2.24).  There is no need to
+   keep it in sync since new fields are added at the end of the structure and
+   do not change the '__private_ss' layout.  */
+typedef struct
+{
+  void *dtv;
+  void *private;
+  void *__private_ss;
+} tcbhead_t;
+
+#define INITIAL_STACK_SIZE  0x4000
+#define BACKOFF             0x1000
+
+void __generic_morestack_set_initial_sp (void *sp, size_t len);
+void *__morestack_get_guard (void);
+void __morestack_set_guard (void *);
+void *__morestack_make_guard (void *stack, size_t size);
+void __morestack_load_mmap (void);
+
+/* We declare it as weak so it fails either at static linking or
+   at runtime if the GLIBC does not have the required TCB field.  */
+extern void __tcb_private_ss (void) __attribute__ ((weak));
+
+/* Initialize the stack guard when the program starts or when a new
+   thread starts.  This is called from a constructor in a .ctors section.  */
+void
+__stack_split_initialize (void)
+{
+  /* Faults here if the C library does not provide __tcb_private_ss.  */
+  __tcb_private_ss ();
+  register void* sp __asm__ ("sp");
+  tcbhead_t *tcb = ((tcbhead_t *) __builtin_thread_pointer ());
+  tcb->__private_ss = (void*)((uintptr_t)sp - INITIAL_STACK_SIZE);
+  __generic_morestack_set_initial_sp (sp, INITIAL_STACK_SIZE);
+}
+
+/* Return current __private_ss.  */
+void *
+__morestack_get_guard (void)
+{
+  tcbhead_t *tcb = ((tcbhead_t *) __builtin_thread_pointer ());
+  return tcb->__private_ss;
+}
+
+/* Set __private_ss to ptr.  */
+void
+__morestack_set_guard (void *ptr)
+{
+  tcbhead_t *tcb = ((tcbhead_t *) __builtin_thread_pointer ());
+  tcb->__private_ss = ptr;
+}
+
+/* Return the stack guard value for given stack.  */
+void *
+__morestack_make_guard (void *stack, size_t size)
+{
+  return (void*)((uintptr_t)stack - size + BACKOFF);
+}
+
+/* Make __stack_split_initialize a high priority constructor.  */
+static void (*const ctors []) 
+  __attribute__ ((used, section (".ctors.65535"), aligned (sizeof (void *))))
+  = { __stack_split_initialize, __morestack_load_mmap };
+
+#endif /* !defined (inhibit_libc) */
diff --git a/libgcc/config/aarch64/morestack.S b/libgcc/config/aarch64/morestack.S
new file mode 100644
index 0000000..aac488d
--- /dev/null
+++ b/libgcc/config/aarch64/morestack.S
@@ -0,0 +1,254 @@
+# AArch64 support for -fsplit-stack.
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This file is part of GCC.
+
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+/* Define an entry point visible from C.  */
+#define ENTRY(name)						\
+  .globl name;							\
+  .type name,%function;						\
+  .align 4;							\
+  name##:
+
+#define END(name)						\
+  .size name,.-name
+
+/* __morestack frame size.  */
+#define MORESTACK_FRAMESIZE	112
+/* Offset from __morestack frame where the new stack size is saved and
+   passed to __generic_morestack.  */
+#define NEWSTACK_SAVE		96
+
+#define BACKOFF			0x1000
+# Large excess allocated when calling non-split-stack code.
+#define NON_SPLIT_STACK		0x100000
+
+# TCB offset of __private_ss
+#define TCB_PRIVATE_SS		#16
+
+	.text
+ENTRY(__morestack_non_split)
+	.cfi_startproc
+# x10 is the requested stack pointer, so lowering it by NON_SPLIT_STACK
+# enlarges the size __morestack computes (sp - x10) by the same amount.
+	sub	x10, x10, NON_SPLIT_STACK
+	.cfi_endproc
+END(__morestack_non_split)
+# Fall through into __morestack
+
+# This function is called with non-standard calling conventions, on entry
+# x10 is the requested stack pointer.  The split-stack prologue is in the
+# form:
+#
+# function:
+#	mrs    x9, tpidr_el0
+#	sub    x10, sp, N & 0xfff000
+#	sub    x10, x10, N & 0xfff
+#	ldr    x9, [x9, 16]
+#	adrp   x12, main_fn_entry
+#	add    x12, x12, :lo12:main_fn_entry
+#	[mov   x11, sp]                # If function has stacked arguments
+#	cmp    x9, x10
+#	b.lt   main_fn_entry
+#	b      __morestack
+# main_fn_entry: [x11 is argument pointer]
+#	[prolog]
+#
+# The normal function prologue follows here, with a small addition at the
+# end to set up the argument pointer if required (the prolog):
+#
+#       [...]                  # default function prologue
+#	b.lt   function:
+# function:
+#
+#
+# The N bit is also restored to indicate that the function is called
+# (so the prologue addition can set up the argument pointer correctly).
+
+ENTRY(__morestack)
+.LFB1:
+	.cfi_startproc
+
+#ifdef __PIC__
+	.cfi_personality 0x9b,DW.ref.__gcc_personality_v0
+	.cfi_lsda 0x1b,.LLSDA1
+#else
+	.cfi_personality 0x3,__gcc_personality_v0
+	.cfi_lsda 0x3,.LLSDA1
+#endif
+
+	# Requested stack size: x10 = sp - requested stack pointer.
+	sub	x10, sp, x10
+	# Allocate the __morestack frame and save the frame record.
+	stp	x29, x30, [sp, -MORESTACK_FRAMESIZE]!
+	.cfi_def_cfa_offset MORESTACK_FRAMESIZE
+	.cfi_offset 29, -MORESTACK_FRAMESIZE
+	.cfi_offset 30, -MORESTACK_FRAMESIZE+8
+	add	x29, sp, 0
+	.cfi_def_cfa_register 29
+	# Save x0-x8 and x12; add BACKOFF to the requested stack size.
+	stp	x0, x1, [sp, 16]
+	stp	x2, x3, [sp, 32]
+	add	x10, x10, BACKOFF
+	stp	x4, x5, [sp, 48]
+	stp	x6, x7, [sp, 64]
+	stp	x8, x12, [sp, 80]
+	str	x10, [sp, NEWSTACK_SAVE]
+
+	# void __morestack_block_signals (void)
+	bl	__morestack_block_signals
+
+	# void *__generic_morestack (size_t *pframe_size,
+	#			     void *old_stack,
+	#			     size_t param_size)
+	# pframe_size: points at the required frame size (updated to the
+	#	       amount of space remaining on the allocated stack).
+	# old_stack: points at the parameters on the old stack
+	# param_size: size in bytes of parameters to copy to the new stack.
+	add	x0, x29, NEWSTACK_SAVE
+	add	x1, x29, MORESTACK_FRAMESIZE
+	mov	x2, 0
+	bl	__generic_morestack
+
+	# Start using new stack
+	mov	sp, x0
+
+	# Set __private_ss stack guard for the new stack.
+	ldr	x9, [x29, NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+.LEHB0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	# void __morestack_unblock_signals (void)
+	bl	__morestack_unblock_signals
+
+	# Set up for a call to the target function.
+	ldp	x0, x1, [x29, 16]
+	ldp	x2, x3, [x29, 32]
+	ldp	x4, x5, [x29, 48]
+	ldp	x6, x7, [x29, 64]
+	ldp	x8, x12, [x29, 80]
+	add	x11, x29, MORESTACK_FRAMESIZE
+	# Indicate __morestack was called.
+	cmp	x12, 0
+	blr	x12
+	# Keep the returned x0-x7 across the calls below.
+	stp	x0, x1, [x29, 16]
+	stp	x2, x3, [x29, 32]
+	stp	x4, x5, [x29, 48]
+	stp	x6, x7, [x29, 64]
+
+	bl	__morestack_block_signals
+
+	# void *__generic_releasestack (size_t *pavailable)
+	add	x0, x29, NEWSTACK_SAVE
+	bl	__generic_releasestack
+
+	# Reset __private_ss stack guard to value for old stack
+	ldr	x9, [x29, NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+
+	# Update TCB split stack field
+.LEHE0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	bl	__morestack_unblock_signals
+
+	# Use the old stack again.  The frame is released only by the last
+	# load, since data below sp may be clobbered by a signal handler.
+	mov	sp, x29
+	ldp	x0, x1, [sp, 16]
+	ldp	x2, x3, [sp, 32]
+	ldp	x4, x5, [sp, 48]
+	ldp	x6, x7, [sp, 64]
+	ldp	x29, x30, [sp], MORESTACK_FRAMESIZE
+
+	.cfi_remember_state
+	.cfi_restore 30
+	.cfi_restore 29
+	.cfi_def_cfa 31, 0
+
+	ret
+
+# This is the cleanup code called by the stack unwinder when
+# unwinding through code between .LEHB0 and .LEHE0 above.
+cleanup:
+	.cfi_restore_state
+	# Reuse the NEWSTACK_SAVE slot to preserve x0 (the exception
+	# header) across the call below.
+	str	x0, [x29, NEWSTACK_SAVE]
+	# size_t __generic_findstack (void *stack)
+	add	x0, x29, MORESTACK_FRAMESIZE
+	bl	__generic_findstack
+	sub	x0, x29, x0
+	add	x0, x0, BACKOFF
+	# Restore tcbhead_t.__private_ss
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+	ldr	x0, [x29, NEWSTACK_SAVE]
+	b	_Unwind_Resume
+        .cfi_endproc
+END(__morestack)
+
+	.section .gcc_except_table,"a",@progbits
+	.align 4
+.LLSDA1:
+	# @LPStart format (omit)
+        .byte   0xff
+	# @TType format (omit)
+        .byte   0xff
+	# Call-site format (uleb128)
+        .byte   0x1
+	# Call-site table length
+        .uleb128 .LLSDACSE1-.LLSDACSB1
+.LLSDACSB1:
+	# region 0 start
+        .uleb128 .LEHB0-.LFB1
+	# length
+        .uleb128 .LEHE0-.LEHB0
+	# landing pad
+        .uleb128 cleanup-.LFB1
+	# no action (ie a cleanup)
+        .uleb128 0
+.LLSDACSE1:
+
+
+	.global __gcc_personality_v0
+#ifdef __PIC__
+	# Build a position independent reference to the personality function.
+	.hidden DW.ref.__gcc_personality_v0
+	.weak   DW.ref.__gcc_personality_v0
+	.section .data.DW.ref.__gcc_personality_v0,"awG",@progbits,DW.ref.__gcc_personality_v0,comdat
+	.type   DW.ref.__gcc_personality_v0, @object
+	.align 3
+DW.ref.__gcc_personality_v0:
+	.size   DW.ref.__gcc_personality_v0, 8
+	.quad   __gcc_personality_v0
+#endif
+
+	.section .note.GNU-stack,"",@progbits
+	.section .note.GNU-split-stack,"",@progbits
+	.section .note.GNU-no-split-stack,"",@progbits
diff --git a/libgcc/config/aarch64/t-stack-aarch64 b/libgcc/config/aarch64/t-stack-aarch64
new file mode 100644
index 0000000..4babb4e
--- /dev/null
+++ b/libgcc/config/aarch64/t-stack-aarch64
@@ -0,0 +1,3 @@
+# Makefile fragment to support -fsplit-stack for aarch64.
+LIB2ADD_ST += $(srcdir)/config/aarch64/morestack.S \
+	      $(srcdir)/config/aarch64/morestack-c.c
diff --git a/libgcc/generic-morestack.c b/libgcc/generic-morestack.c
index adbe436..e91ceb4 100644
--- a/libgcc/generic-morestack.c
+++ b/libgcc/generic-morestack.c
@@ -943,6 +943,7 @@ __splitstack_find (void *segment_arg, void *sp, size_t *len,
       nsp -= 2 * 160;
 #elif defined __s390__
       nsp -= 2 * 96;
+#elif defined __aarch64__
 #else
 #error "unrecognized target"
 #endif
-- 
2.1.4


Jiong Wang Jan. 25, 2017, 12:10 p.m. UTC | #3
On 24/01/17 18:05, Adhemerval Zanella wrote:
>

> On 03/01/2017 13:13, Wilco Dijkstra wrote:

>

>> +  /* If function uses stacked arguments save the old stack value so morestack

>> +     can return it.  */

>> +  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);

>> +  if (cfun->machine->frame.saved_regs_size

>> +      || cfun->machine->frame.saved_varargs_size)

>> +    emit_move_insn (reg11, stack_pointer_rtx);

>>

>> This doesn't look right - we could have many arguments even without varargs or

>> saved regs.  This would need to check varargs as well as ctrl->args.size (I believe

>> that is the size of the arguments on the stack). It's fine to omit this optimization

>> in the first version - we already emit 2-3 extra instructions for the check anyway.

> I will check for a better solution.


Hi Adhemerval

   My only concern with this patch is the initialization of R11 (internal arg
pointer).  The current implementation looks to me to be generating wrong code
for a testcase that simply returns the sum of ten int params: I see the function
body using R11 while there is no initialization of it in the split prologue, so
if the execution flow is *not* through __morestack, then R11 is not initialized.
         
As Wilco suggested, I feel using crtl->args.size instead of
cfun->machine->frame.saved_regs_size might be the correct approach after
checking assign_parms in function.c.
Adhemerval Zanella Jan. 31, 2017, 7:57 p.m. UTC | #4
On 25/01/2017 10:10, Jiong Wang wrote:
> On 24/01/17 18:05, Adhemerval Zanella wrote:

>>

>> On 03/01/2017 13:13, Wilco Dijkstra wrote:

>>

>>> +  /* If function uses stacked arguments save the old stack value so morestack

>>> +     can return it.  */

>>> +  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);

>>> +  if (cfun->machine->frame.saved_regs_size

>>> +      || cfun->machine->frame.saved_varargs_size)

>>> +    emit_move_insn (reg11, stack_pointer_rtx);

>>>

>>> This doesn't look right - we could have many arguments even without varargs or

>>> saved regs.  This would need to check varargs as well as ctrl->args.size (I believe

>>> that is the size of the arguments on the stack). It's fine to omit this optimization

>>> in the first version - we already emit 2-3 extra instructions for the check anyway.

>> I will check for a better solution.

> 

> Hi Adhemerval

> 

>   My only concern on this this patch is the initialization of R11 (internal arg

> pointer).  The current implementation looks to me is generating wrong code for a

> testcase simply return the sum of ten int param, I see the function body is

> using R11 while there is no initialization of it in split prologue,  so if the

> execution flow is *not* through __morestack, then R11 is not initialized.

>         As Wilco suggested, I feel using crtl->args.size instead of

> cfun->machine->frame.saved_regs_size might be the correct approach after

> checking assign_parms in function.c.

> 


Hi Jiong,

Indeed, the previous version, which used 'saved_regs_size', is wrong for stacked
parameters.  A simple call to a function with 10 arguments shows the following
values at the point where the reg11 setup is evaluated:

cfun->machine->frame.saved_regs_size    = 0
cfun->machine->frame.saved_varargs_size = 0
crtl->args.size                         = 16

So indeed 'crtl->args.size' seems to be the correct value to use in this case
(it will trigger the correct reg11 setup for the split-stack case).  In this version
I also removed some unused variables I left in the previous patch.

From 30cbedc303d364dd94f0d35abee1d237a3671cdb Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Date: Wed, 4 May 2016 21:13:39 +0000
Subject: [PATCH] aarch64: Add split-stack initial support

This patch adds the split-stack support on aarch64 (PR #67877).  As for
other ports this patch should be used along with glibc and gold support.

The support is done similarly to other architectures: a __private_ss field is
added to the TCB in glibc, a target-specific __morestack implementation and
helper functions are added to libgcc, and compiler support is adjusted
(split-stack prologue, va_start for argument handling).  I also plan to
send the gold support to adjust stack allocation across split-stack
and default code calls.

The current approach is similar to the powerpc one: at most 4 GB of stack
allocation is supported, so stack adjustments can be done with 2 instructions
(either a movn plus a no-op movk, or a movn followed by movk).  The morestack
call is non-standard, with x10 holding the requested stack pointer, x11 the
argument pointer, and x12 the continuation address to return to.  Unwinding is
handled by a personality routine that knows how to find stack segments.

Split-stack prologue on function entry is as follow (this goes before the
usual function prologue):

function:
	mrs    x9, tpidr_el0
	mov    x10, -<required stack allocation>
	movk   x10, 0x0
	add    x10, sp, x10
	mov    x11, sp   	# if function has stacked arguments
	adrp   x12, main_fn_entry
	add    x12, x12, :lo12:main_fn_entry
	cmp    x9, x10
	b.lt   <main_fn_entry>
	b      __morestack
main_fn_entry:
	[function prologue]

Notes:

1. Even if a function does not allocate a stack frame, a split-stack prologue
   is created.  It is to avoid issues with tail call for external symbols
   which might require linker adjustment (libgo/runtime/go-varargs.c).

2. Basic-block reordering (enabled with -O2) will move split-stack TCB ldr
   to after the required stack calculation.

3. Similar to powerpc, When the linker detects a call from split-stack to
   non-split-stack code, it adds 16k (or more) to the value found in "allocate"
   instructions (so non-split-stack code gets a larger stack).  The amount is
   tunable by a linker option.  The edit means aarch64 does not need to
   implement __morestack_non_split, necessary on x86 because insufficient
   space is available there to edit the stack comparison code.  This feature
   is only implemented in the GNU gold linker.

4. AArch64 does not handle >4G stacks initially.  Although it is possible
   to implement that, limiting the size to 4G allows the allocation to be
   materialized with only 2 instructions (mov + movk), which simplifies the
   linker adjustments required.  Supporting multiple threads each requiring
   more than 4G of stack is probably not that important, and is likely to OOM
   at run time.

5. The TCB support on GLIBC is meant to be included in version 2.25.

6. The continuation address is materialized in x12 using 'adrp' plus
   'add' and a static relocation.  The current code uses the
   aarch64_expand_mov_immediate function; since a better alternative
   would be 'adr', that could be a future optimization (not implemented
   in this patch).

libgcc/ChangeLog:

	* libgcc/config.host: Use t-stack and t-stack-aarch64 for
	aarch64*-*-linux.
	* libgcc/config/aarch64/morestack-c.c: New file.
	* libgcc/config/aarch64/morestack.S: Likewise.
	* libgcc/config/aarch64/t-stack-aarch64: Likewise.
	* libgcc/generic-morestack.c (__splitstack_find): Add aarch64-specific
	code.

gcc/ChangeLog:

	* common/config/aarch64/aarch64-common.c
	(aarch64_supports_split_stack): New function.
	(TARGET_SUPPORTS_SPLIT_STACK): New macro.
	* gcc/config/aarch64/aarch64-linux.h (TARGET_ASM_FILE_END): Remove
	macro.
	* gcc/config/aarch64/aarch64-protos.h: Add
	aarch64_expand_split_stack_prologue and
	aarch64_split_stack_space_check.
	* gcc/config/aarch64/aarch64.c (aarch64_expand_prologue): Setup the
	argument pointer (x11) for split-stack.
	(aarch64_expand_builtin_va_start): Use internal argument pointer
	instead of virtual_incoming_args_rtx.
	(morestack_ref): New symbol.
	(aarch64_expand_split_stack_prologue): New function.
	(aarch64_file_end): Emit the split-stack note sections.
	(aarch64_internal_arg_pointer): Likewise.
	(aarch64_live_on_entry): Set the argument pointer for split-stack.
	(aarch64_split_stack_space_check): Likewise.
	(TARGET_ASM_FILE_END): New macro.
	(TARGET_EXTRA_LIVE_ON_ENTRY): Likewise.
	(TARGET_INTERNAL_ARG_POINTER): Likewise.
	* gcc/config/aarch64/aarch64.h (aarch64_frame): Add
	split_stack_arg_pointer to setup the argument pointer when using
	split-stack.
	* gcc/config/aarch64/aarch64.md (UNSPEC_STACK_CHECK): New unspec.
	(UNSPECV_SPLIT_STACK_CALL): Likewise.
	(split_stack_prologue): New expand.
	(split_stack_space_check): Likewise.
	(split_stack_cond_call): New expand.
---
 gcc/common/config/aarch64/aarch64-common.c |  16 +-
 gcc/config/aarch64/aarch64-linux.h         |   2 -
 gcc/config/aarch64/aarch64-protos.h        |   2 +
 gcc/config/aarch64/aarch64.c               | 173 +++++++++++++++++++-
 gcc/config/aarch64/aarch64.h               |   3 +
 gcc/config/aarch64/aarch64.md              |  57 +++++++
 libgcc/config.host                         |   1 +
 libgcc/config/aarch64/morestack-c.c        |  95 +++++++++++
 libgcc/config/aarch64/morestack.S          | 254 +++++++++++++++++++++++++++++
 libgcc/config/aarch64/t-stack-aarch64      |   3 +
 libgcc/generic-morestack.c                 |   1 +
 11 files changed, 602 insertions(+), 5 deletions(-)
 create mode 100644 libgcc/config/aarch64/morestack-c.c
 create mode 100644 libgcc/config/aarch64/morestack.S
 create mode 100644 libgcc/config/aarch64/t-stack-aarch64

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index a0b7f48..286f0c6 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -107,6 +107,21 @@ aarch64_handle_option (struct gcc_options *opts,
     }
 }
 
+/* Implement TARGET_SUPPORTS_SPLIT_STACK; always returns true.  -fsplit-stack
+   uses a TCB field available from glibc 2.25, which also exports a symbol,
+   __tcb_private_ss, to signal that the field exists.  This prevents binaries
+   linked against a newer GLIBC from running on unsupported ones.  */
+
+static bool
+aarch64_supports_split_stack (bool report ATTRIBUTE_UNUSED,
+			      struct gcc_options *opts ATTRIBUTE_UNUSED)
+{
+  return true;
+}
+
+#undef TARGET_SUPPORTS_SPLIT_STACK
+#define TARGET_SUPPORTS_SPLIT_STACK aarch64_supports_split_stack
+
 struct gcc_targetm_common targetm_common = TARGETM_COMMON_INITIALIZER;
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -340,4 +355,3 @@ aarch64_rewrite_mcpu (int argc, const char **argv)
 }
 
 #undef AARCH64_CPU_NAME_LENGTH
-
diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
index c45fc1d..b8daba4 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -80,8 +80,6 @@
     }						\
   while (0)
 
-#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
-
 /* Uninitialized common symbols in non-PIE executables, even with
    strong definitions in dependent shared libraries, will resolve
    to COPY relocated symbol in the executable.  See PR65780.  */
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f55d4ba..bfb8b51 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -377,6 +377,8 @@ void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
 void aarch64_expand_prologue (void);
+void aarch64_expand_split_stack_prologue (void);
+void aarch64_split_stack_space_check (rtx, rtx);
 void aarch64_expand_vector_init (rtx, rtx);
 void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
 				   const_tree, unsigned);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f343d92..1d86bfa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -901,7 +901,12 @@ aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
     output_asm_insn (buffer, operands);
 
-    snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
+    if (GET_CODE (operands[pos_label]) == LABEL_REF)
+      snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label,
+		label_ptr);
+    else
+      snprintf (buffer, sizeof (buffer), "b\t%%%d\n%s:", pos_label,
+		label_ptr);
     operands[pos_label] = dest_label;
     output_asm_insn (buffer, operands);
     return "";
@@ -10122,7 +10127,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
   /* Emit code to initialize STACK, which points to the next varargs stack
      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
      by named arguments.  STACK is 8-byte aligned.  */
-  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
+  t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
   if (cum->aapcs_stack_size > 0)
     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
@@ -14680,6 +14685,164 @@ aarch64_excess_precision (enum excess_precision_type type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* -fsplit-stack support.  */
+
+/* A SYMBOL_REF for __morestack.  */
+static GTY(()) rtx morestack_ref;
+
+/* Emit the -fsplit-stack prologue, which goes before the regular function
+   prologue.  */
+
+void
+aarch64_expand_split_stack_prologue (void)
+{
+  rtx ssvalue, mem;
+  rtx reg10, reg11, reg12, cc, cmp, jump;
+  HOST_WIDE_INT allocate;
+  rtx_code_label *ok_label = NULL;
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  gcc_assert (flag_split_stack && reload_completed);
+
+  /* Limit the total stack allocation to 4G so its value can be
+     materialized using at most two instructions (movn/movk).  The linker
+     may edit them to add extra space when split-stack code calls
+     non-split-stack functions.  */
+  allocate = cfun->machine->frame.frame_size;
+  if (allocate > ((int64_t)1 << 32))
+    {
+      sorry ("Stack frame larger than 4G is not supported for -fsplit-stack");
+      return;
+    }
+
+  if (morestack_ref == NULL_RTX)
+    {
+      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
+					   | SYMBOL_FLAG_FUNCTION);
+    }
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* Always emit two insns to calculate the requested stack, so the linker
+     can edit them when adjusting size for calling non-split-stack code.  */
+  reg10 = gen_rtx_REG (Pmode, R10_REGNUM);
+  int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate),
+					      true, Pmode);
+  gcc_assert (ninsn == 1 || ninsn == 2);
+  if (ninsn == 1)
+    {
+      if (allocate > 0)
+	emit_insn (gen_insv_immdi (reg10, GEN_INT (0), GEN_INT (0xffff0000)));
+      else
+	emit_insn (gen_insv_immdi (reg10, GEN_INT (0), GEN_INT (0x0)));
+    }
+  emit_insn (gen_add3_insn (reg10, stack_pointer_rtx, reg10));
+
+  ok_label = gen_label_rtx ();
+
+  /* If the function uses stacked arguments, save the incoming stack pointer
+     in x11, from which the internal argument pointer is initialized.  */
+  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);
+  if (crtl->args.size
+      || cfun->machine->frame.saved_varargs_size)
+    emit_move_insn (reg11, stack_pointer_rtx);
+
+  /* x12 holds the continuation address used to return to function.  */
+  reg12 = gen_rtx_REG (Pmode, R12_REGNUM);
+  aarch64_expand_mov_immediate (reg12, gen_rtx_LABEL_REF (VOIDmode, ok_label));
+
+  /* Jump to __morestack call if current __private_ss is not sufficient.  */
+  cc = aarch64_gen_compare_reg (GE, ssvalue, reg10);
+  cmp = gen_rtx_fmt_ee (GE, VOIDmode, cc, const0_rtx);
+  jump = gen_split_stack_cond_call (morestack_ref, cmp, ok_label, reg12);
+
+  aarch64_emit_unlikely_jump (jump);
+  JUMP_LABEL (jump) = ok_label;
+  LABEL_NUSES (ok_label)++;
+
+  /* __morestack will call us here.  */
+  emit_label (ok_label);
+}
+
+/* Implement TARGET_ASM_FILE_END.  */
+
+static void
+aarch64_file_end (void)
+{
+  file_end_indicate_exec_stack ();
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
+}
+
+/* Return the internal arg pointer used for function incoming arguments.  */
+
+static rtx
+aarch64_internal_arg_pointer (void)
+{
+  if (flag_split_stack
+     && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
+         == NULL))
+    {
+      if (cfun->machine->frame.split_stack_arg_pointer == NULL_RTX)
+	{
+	  rtx pat;
+
+	  cfun->machine->frame.split_stack_arg_pointer = gen_reg_rtx (Pmode);
+	  REG_POINTER (cfun->machine->frame.split_stack_arg_pointer) = 1;
+
+	  /* Put the pseudo initialization right after the note at the
+	     beginning of the function.  */
+	  pat = gen_rtx_SET (cfun->machine->frame.split_stack_arg_pointer,
+			     gen_rtx_REG (Pmode, R11_REGNUM));
+	  push_topmost_sequence ();
+	  emit_insn_after (pat, get_insns ());
+	  pop_topmost_sequence ();
+	}
+      return plus_constant (Pmode, cfun->machine->frame.split_stack_arg_pointer,
+			    FIRST_PARM_OFFSET (current_function_decl));
+    }
+  return virtual_incoming_args_rtx;
+}
+
+/* Emit -fsplit-stack dynamic stack allocation space check.  */
+
+void
+aarch64_split_stack_space_check (rtx size, rtx label)
+{
+  rtx mem, ssvalue, cc, cmp, jump;
+  rtx reg10, reg12;
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_reg_rtx (Pmode);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* And compare it with the stack pointer minus the requested size.  */
+  reg10 = gen_rtx_REG (Pmode, R10_REGNUM);
+  size = force_reg (Pmode, size);
+  emit_move_insn (reg10, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));
+
+  /* x12 holds the continuation address used to return to function.  */
+  reg12 = gen_rtx_REG (Pmode, R12_REGNUM);
+  aarch64_expand_mov_immediate (reg12, gen_rtx_LABEL_REF (VOIDmode, label));
+
+  /* Jump to LABEL if __private_ss >= reg10.  NOTE(review): verify sense.  */
+  cc = aarch64_gen_compare_reg (GE, ssvalue, reg10);
+  cmp = gen_rtx_fmt_ee (GE, VOIDmode, cc, const0_rtx);
+  jump = emit_jump_insn (gen_condjump (cmp, cc, label));
+  JUMP_LABEL (jump) = label;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -14752,6 +14915,9 @@ aarch64_run_selftests (void)
 #undef TARGET_ASM_FILE_START
 #define TARGET_ASM_FILE_START aarch64_start_file
 
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END aarch64_file_end
+
 #undef TARGET_ASM_OUTPUT_MI_THUNK
 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
 
@@ -14837,6 +15003,9 @@ aarch64_run_selftests (void)
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
 
+#undef TARGET_INTERNAL_ARG_POINTER
+#define TARGET_INTERNAL_ARG_POINTER aarch64_internal_arg_pointer
+
 #undef TARGET_GIMPLE_FOLD_BUILTIN
 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e4fb96f..510f60a 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -594,6 +594,9 @@ struct GTY (()) aarch64_frame
   unsigned wb_candidate2;
 
   bool laid_out;
+
+  /* Alternative internal arg pointer for -fsplit-stack.  */
+  rtx split_stack_arg_pointer;
 };
 
 typedef struct GTY (()) machine_function
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a693a3b..d08b5fc 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -135,6 +135,7 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_STACK_CHECK
     UNSPEC_RSQRT
     UNSPEC_RSQRTE
     UNSPEC_RSQRTS
@@ -150,6 +151,7 @@
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
     UNSPECV_BLOCKAGE		; Represent a blockage
     UNSPECV_PROBE_STACK_RANGE	; Represent stack range probing.
+    UNSPECV_SPLIT_STACK_CALL    ; Represent a morestack call
   ]
 )
 
@@ -5538,3 +5540,58 @@
 
 ;; ldp/stp peephole patterns
 (include "aarch64-ldpstp.md")
+
+;; Handle -fsplit-stack
+(define_expand "split_stack_prologue"
+  [(const_int 0)]
+  ""
+{
+  aarch64_expand_split_stack_prologue ();
+  DONE;
+})
+
+;; If there are operand 0 bytes available on the stack, jump to
+;; operand 1.
+(define_expand "split_stack_space_check"
+  [(set (match_dup 2) (compare:CC (match_dup 3) (match_dup 2)))
+   (set (pc) (if_then_else
+	      (geu (match_dup 4) (const_int 0))
+	      (label_ref (match_operand 1))
+	      (pc)))]
+  ""
+{
+  aarch64_split_stack_space_check (operands[0], operands[1]);
+  DONE;
+})
+
+;; A __morestack call using branch
+
+(define_expand "split_stack_cond_call"
+  [(match_operand 0 "aarch64_call_insn_operand" "")
+   (match_operand 1 "" "")
+   (match_operand 2 "" "")
+   (match_operand 3 "" "")]
+  ""
+{
+  emit_jump_insn (gen_split_stack_cond_call_di (operands[0], operands[1],
+						operands[2], operands[3]));
+  DONE;
+})
+
+
+(define_insn "split_stack_cond_call_<mode>"
+  [(set (pc)
+        (if_then_else
+          (match_operand 1 "aarch64_comparison_operator" "")
+          (label_ref (match_operand 2 "" ""))
+          (pc)))
+   (set (reg:P 1) (unspec_volatile:P [(match_operand:P 0 "aarch64_call_insn_operand" "")
+                                    (reg:P 1)]
+                                   UNSPECV_SPLIT_STACK_CALL))
+   (use (match_operand:P 3 "register_operand" ""))]
+  ""
+  {
+    return aarch64_gen_far_branch (operands, 0, "Lbcond", "b%M1\\t");
+  }
+  [(set_attr "type" "branch")]
+)
diff --git a/libgcc/config.host b/libgcc/config.host
index 540bfa9..ef2bd84 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -344,6 +344,7 @@ aarch64*-*-linux*)
 	md_unwind_header=aarch64/linux-unwind.h
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+	tmake_file="${tmake_file} t-stack aarch64/t-stack-aarch64"
 	;;
 alpha*-*-linux*)
 	tmake_file="${tmake_file} alpha/t-alpha alpha/t-ieee t-crtfm alpha/t-linux"
diff --git a/libgcc/config/aarch64/morestack-c.c b/libgcc/config/aarch64/morestack-c.c
new file mode 100644
index 0000000..8df7895
--- /dev/null
+++ b/libgcc/config/aarch64/morestack-c.c
@@ -0,0 +1,95 @@
+/* AArch64 support for -fsplit-stack.
+ * Copyright (C) 2016 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef inhibit_libc
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include "generic-morestack.h"
+
+/* This is based on GLIBC definition (version 2.24).  There is no need to
+   keep it sync since new fields are added on the end of structure and do
+   not change the '__private_ss' layout.  */
+typedef struct
+{
+  void *dtv;
+  void *private;
+  void *__private_ss;
+} tcbhead_t;
+
+#define INITIAL_STACK_SIZE  0x4000
+#define BACKOFF             0x1000
+
+void __generic_morestack_set_initial_sp (void *sp, size_t len);
+void *__morestack_get_guard (void);
+void __morestack_set_guard (void *);
+void *__morestack_make_guard (void *stack, size_t size);
+void __morestack_load_mmap (void);
+
+/* We declare is as weak so it fails either at stack linking or
+   at runtime if the GLIBC does not have the required TCB field.  */
+extern void __tcb_private_ss (void) __attribute__ ((weak));
+
+/* Initialize the stack guard when the program or a new thread starts.
+   This is called from a constructor placed in the .ctors section.  */
+void
+__stack_split_initialize (void)
+{
+  /* Weak symbol: meant to fail here if glibc lacks the TCB field.  */
+  __tcb_private_ss ();
+  register void* sp __asm__ ("sp");
+  tcbhead_t *tcb = ((tcbhead_t *) __builtin_thread_pointer ());
+  tcb->__private_ss = (void*)((uintptr_t)sp - INITIAL_STACK_SIZE);
+  __generic_morestack_set_initial_sp (sp, INITIAL_STACK_SIZE);
+}
+
+/* Read the __private_ss stack guard from the calling thread's TCB.  */
+void *
+__morestack_get_guard (void)
+{
+  tcbhead_t *head = __builtin_thread_pointer ();
+  return head->__private_ss;
+}
+
+/* Store PTR as the __private_ss stack guard in the calling thread's TCB.  */
+void
+__morestack_set_guard (void *ptr)
+{
+  tcbhead_t *head = __builtin_thread_pointer ();
+  head->__private_ss = ptr;
+}
+
+/* Return the stack guard value for given stack.  */
+void *
+__morestack_make_guard (void *stack, size_t size)
+{
+  return (void*)((uintptr_t)stack - size + BACKOFF);
+}
+
+/* Make __stack_split_initialize a high priority constructor.  */
+static void (*const ctors []) 
+  __attribute__ ((used, section (".ctors.65535"), aligned (sizeof (void *))))
+  = { __stack_split_initialize, __morestack_load_mmap };
+
+#endif /* !defined (inhibit_libc) */
diff --git a/libgcc/config/aarch64/morestack.S b/libgcc/config/aarch64/morestack.S
new file mode 100644
index 0000000..aac488d
--- /dev/null
+++ b/libgcc/config/aarch64/morestack.S
@@ -0,0 +1,254 @@
+# AArch64 support for -fsplit-stack.
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This file is part of GCC.
+
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+/* Define an entry point visible from C.  */
+#define ENTRY(name)						\
+  .globl name;							\
+  .type name,%function;						\
+  .align 4;							\
+  name##:
+
+#define END(name)						\
+  .size name,.-name
+
+/* __morestack frame size.  */
+#define MORESTACK_FRAMESIZE	112
+/* Offset from __morestack frame where the new stack size is saved and
+   passed to __generic_morestack.  */
+#define NEWSTACK_SAVE		96
+
+#define BACKOFF			0x1000
+# Large excess allocated when calling non-split-stack code.
+#define NON_SPLIT_STACK		0x100000
+
+# TCB offset of __private_ss
+#define TCB_PRIVATE_SS		#16
+
+	.text
+ENTRY(__morestack_non_split)
+	.cfi_startproc
+# x10 is the requested stack pointer, so lower it (the stack grows down)
+# to request NON_SPLIT_STACK extra bytes for the non-split-stack callee.
+	sub	x10, x10, NON_SPLIT_STACK
+	.cfi_endproc
+END(__morestack_non_split)
+# Fall through into __morestack
+
+# This function is called with non-standard calling conventions, on entry
+# x10 is the requested stack pointer.  The split-stack prologue is in the
+# form:
+#
+# function:
+#	mrs    x9, tpidr_el0
+#	sub    x10, sp, N & 0xfff000
+#	sub    x10, x10, N & 0xfff
+#	ldr    x9, [x9, 16]
+#	adrp   x12, main_fn_entry
+#	add    x12, x12, :lo12:main_fn_entry
+#	[mov   x11, sp]                # If function has stacked arguments
+#	cmp    x9, x10
+#	b.lt   main_fn_entry
+#	b      __morestack
+# main_fn_entry: [x11 is argument pointer]
+#	[prolog]
+#
+# The normal function prologue follows here, with a small addition at the
+# end to set up the argument pointer if required (the prolog):
+#
+#       [...]                  # default function prologue
+#	b.lt   function:
+# function:
+#
+#
+# The N bit is also restored to indicate that the function is called
+# (so the prologue addition can set up the argument pointer correctly).
+
+ENTRY(__morestack)
+.LFB1:
+	.cfi_startproc
+
+#ifdef __PIC__
+	.cfi_personality 0x9b,DW.ref.__gcc_personality_v0
+	.cfi_lsda 0x1b,.LLSDA1
+#else
+	.cfi_personality 0x3,__gcc_personality_v0
+	.cfi_lsda 0x3,.LLSDA1
+#endif
+
+	# Calculate requested stack size (x10 = sp - requested sp).
+	sub	x10, sp, x10
+	# Allocate the __morestack frame and save the frame record.
+	stp	x29, x30, [sp, -MORESTACK_FRAMESIZE]!
+	.cfi_def_cfa_offset MORESTACK_FRAMESIZE
+	.cfi_offset 29, -MORESTACK_FRAMESIZE
+	.cfi_offset 30, -MORESTACK_FRAMESIZE+8
+	add	x29, sp, 0
+	.cfi_def_cfa_register 29
+	# Save x0-x8 and x12; also pad the requested size by BACKOFF.
+	stp	x0, x1, [sp, 16]
+	stp	x2, x3, [sp, 32]
+	add	x10, x10, BACKOFF
+	stp	x4, x5, [sp, 48]
+	stp	x6, x7, [sp, 64]
+	stp	x8, x12, [sp, 80]
+	str	x10, [sp, NEWSTACK_SAVE]
+
+	# void __morestack_block_signals (void)
+	bl	__morestack_block_signals
+
+	# void *__generic_morestack (size_t *pframe_size,
+	#			     void *old_stack,
+	#			     size_t param_size)
+	# pframe_size: in: size of the required stack frame; out: amount of
+	#	       space remaining on the allocated stack.
+	# old_stack: points at the parameters the old stack
+	# param_size: size in bytes of parameters to copy to the new stack.
+	add	x0, x29, NEWSTACK_SAVE
+	add	x1, x29, MORESTACK_FRAMESIZE
+	mov	x2, 0
+	bl	__generic_morestack
+
+	# Start using new stack
+	mov	sp, x0
+
+	# Set __private_ss stack guard for the new stack.
+	ldr	x9, [x29, NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+.LEHB0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	# void __morestack_unblock_signals (void)
+	bl	__morestack_unblock_signals
+
+	# Set up for a call to the target function.
+	ldp	x0, x1, [x29, 16]
+	ldp	x2, x3, [x29, 32]
+	ldp	x4, x5, [x29, 48]
+	ldp	x6, x7, [x29, 64]
+	ldp	x8, x12, [x29, 80]
+	add	x11, x29, MORESTACK_FRAMESIZE
+	# Indicate __morestack was called.
+	cmp	x12, 0
+	blr	x12
+
+	stp	x0, x1, [x29, 16]
+	stp	x2, x3, [x29, 32]
+	stp	x4, x5, [x29, 48]
+	stp	x6, x7, [x29, 64]
+
+	bl	__morestack_block_signals
+
+	# void *__generic_releasestack (size_t *pavailable)
+	add	x0, x29, NEWSTACK_SAVE
+	bl	__generic_releasestack
+
+	# Reset __private_ss stack guard to value for old stack
+	ldr	x9, [x29, NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+
+	# Update TCB split stack field
+.LEHE0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	bl	__morestack_unblock_signals
+
+	# Use old stack again, with sp at the base of the __morestack frame.
+	mov	sp, x29
+	# Pop the frame only after reloading: data below sp can be clobbered.
+	ldp	x0, x1, [sp, 16]
+	ldp	x2, x3, [sp, 32]
+	ldp	x4, x5, [sp, 48]
+	ldp	x6, x7, [sp, 64]
+	ldp	x29, x30, [sp], MORESTACK_FRAMESIZE
+
+	.cfi_remember_state
+	.cfi_restore 30
+	.cfi_restore 29
+	.cfi_def_cfa 31, 0
+
+	ret
+
+# This is the cleanup code called by the stack unwinder when
+# unwinding through code between .LEHB0 and .LEHE0 above.
+cleanup:
+	.cfi_restore_state
+	# Reuse the new stack allocation to save/restore the
+	# exception header
+	str	x0, [x29, NEWSTACK_SAVE]
+	# size_t __generic_findstack (void *stack)
+	add	x0, x29, MORESTACK_FRAMESIZE
+	bl	__generic_findstack
+	sub	x0, x29, x0
+	add	x0, x0, BACKOFF
+	# Restore tcbhead_t.__private_ss
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+	ldr	x0, [x29, NEWSTACK_SAVE]
+	b	_Unwind_Resume
+        .cfi_endproc
+END(__morestack)
+
+	.section .gcc_except_table,"a",@progbits
+	.align 4
+.LLSDA1:
+	# @LPStart format (omit)
+        .byte   0xff
+	# @TType format (omit)
+        .byte   0xff
+	# Call-site format (uleb128)
+        .byte   0x1
+	# Call-site table length
+        .uleb128 .LLSDACSE1-.LLSDACSB1
+.LLSDACSB1:
+	# region 0 start
+        .uleb128 .LEHB0-.LFB1
+	# length
+        .uleb128 .LEHE0-.LEHB0
+	# landing pad
+        .uleb128 cleanup-.LFB1
+	# no action (ie a cleanup)
+        .uleb128 0
+.LLSDACSE1:
+
+
+	.global __gcc_personality_v0
+#ifdef __PIC__
+	# Build a position independent reference to the personality function.
+	.hidden DW.ref.__gcc_personality_v0
+	.weak   DW.ref.__gcc_personality_v0
+	.section .data.DW.ref.__gcc_personality_v0,"awG",@progbits,DW.ref.__gcc_personality_v0,comdat
+	.type   DW.ref.__gcc_personality_v0, @object
+	.align 3
+DW.ref.__gcc_personality_v0:
+	.size   DW.ref.__gcc_personality_v0, 8
+	.quad   __gcc_personality_v0
+#endif
+
+	.section .note.GNU-stack,"",@progbits
+	.section .note.GNU-split-stack,"",@progbits
+	.section .note.GNU-no-split-stack,"",@progbits
diff --git a/libgcc/config/aarch64/t-stack-aarch64 b/libgcc/config/aarch64/t-stack-aarch64
new file mode 100644
index 0000000..4babb4e
--- /dev/null
+++ b/libgcc/config/aarch64/t-stack-aarch64
@@ -0,0 +1,3 @@
+# Makefile fragment to support -fsplit-stack for aarch64.
+LIB2ADD_ST += $(srcdir)/config/aarch64/morestack.S \
+	      $(srcdir)/config/aarch64/morestack-c.c
diff --git a/libgcc/generic-morestack.c b/libgcc/generic-morestack.c
index adbe436..e91ceb4 100644
--- a/libgcc/generic-morestack.c
+++ b/libgcc/generic-morestack.c
@@ -943,6 +943,7 @@ __splitstack_find (void *segment_arg, void *sp, size_t *len,
       nsp -= 2 * 160;
 #elif defined __s390__
       nsp -= 2 * 96;
+#elif defined __aarch64__
 #else
 #error "unrecognized target"
 #endif
-- 
2.1.4


Wilco Dijkstra Feb. 1, 2017, 2:31 p.m. UTC | #5
Hi Adhemerval,

The argument code looks good now, but this isn't right:

+  int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate),
+                                             true, Pmode);
+  gcc_assert (ninsn == 1 || ninsn == 2);
+  if (ninsn == 1)
+    {
+      if (allocate > 0)
+       emit_insn (gen_insv_immdi (reg10, GEN_INT (0), GEN_INT (0xffff0000)));
+      else
+       emit_insn (gen_insv_immdi (reg10, GEN_INT (0), GEN_INT (0x0)));
+    }

Both insv_imm will always set the low 16 bits of X10 to zero, corrupting the value
of the first instruction. It seems best to emit both instructions explicitly and use
positive values to avoid the zero special case (this should make the linker code
updating the allocation simpler too):

gen_rtx_SET (reg10, GEN_INT (allocate & 0xffff))
gen_insv_immdi (reg10, GEN_INT (16), GEN_INT ((allocate & 0xffff0000) >> 16))

I bet this will avoid the crash you mentioned.

Wilco
diff mbox

Patch

From 5c5bbd7981317c5fc07fcb82a2e831cb6cc1d7be Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Wed, 4 May 2016 21:13:39 +0000
Subject: [PATCH] aarch64: Add split-stack initial support

This patch adds the split-stack support on aarch64 (PR #67877).  As for
other ports this patch should be used along with glibc and gold support.

The support is done similarly to other architectures: a __private_ss field is
added to the TCB in glibc, a target-specific __morestack implementation and
helper functions are added in libgcc, and compiler support is adjusted
(split-stack prologue, va_start for argument handling).  I also plan to
send the gold support to adjust stack allocation across split-stack
and default code calls.

The current approach is similar to the powerpc one: at most 4 GB of stack
allocation is supported (see note 4), so stack adjustments can be done with
2 instructions (either just a movn plus nop or a movn followed by movk).  The
morestack call is non-standard, with x10 holding the requested stack pointer,
x11 the argument pointer, and x12 the continuation address to return to.
Unwinding is handled by a personality routine that knows how to find stack
segments.

The split-stack prologue on function entry is as follows (this goes before
the usual function prologue):

function:
	mrs    x9, tpidr_el0
	mov    x10, -<required stack allocation>
	movk   0x0
	add    x10, sp, x10
	mov    x11, sp   	# if function has stacked arguments
	adrp   x12, main_fn_entry
	add    x12, x12, :lo12:main_fn_entry
	cmp    x9, x10
	b.lt   <main_fn_entry>
	b      __morestack
main_fn_entry:
	[function prologue]

Notes:

1. Even if a function does not allocate a stack frame, a split-stack prologue
   is created.  It is to avoid issues with tail call for external symbols
   which might require linker adjustment (libgo/runtime/go-varargs.c).

2. Basic-block reordering (enabled with -O2) will move split-stack TCB ldr
   to after the required stack calculation.

3. Similar to powerpc, when the linker detects a call from split-stack to
   non-split-stack code, it adds 16k (or more) to the value found in "allocate"
   instructions (so non-split-stack code gets a larger stack).  The amount is
   tunable by a linker option.  The edit means aarch64 does not need to
   implement __morestack_non_split, necessary on x86 because insufficient
   space is available there to edit the stack comparison code.  This feature
   is only implemented in the GNU gold linker.

4. AArch64 does not handle >4G stack initially and, although it is possible
   to implement it, limiting to 4G allows the allocation to be materialized
   with only 2 instructions (mov + movk), thus simplifying the linker
   adjustments required.  Supporting multiple threads each requiring more
   than 4G of stack is probably not that important, and likely to OOM at
   run time.

5. The TCB support on GLIBC is meant to be included in version 2.25.

6. The continuation address is materialized in x12 using 'adrp' plus add
   and a static relocation.  The current code uses the
   aarch64_expand_mov_immediate function; a better alternative would be
   a single 'adr', which could be a future optimization (not implemented
   in this patch).

libgcc/ChangeLog:

	* libgcc/config.host: Use t-stack and t-stack-aarch64 for
	aarch64*-*-linux.
	* libgcc/config/aarch64/morestack-c.c: New file.
	* libgcc/config/aarch64/morestack.S: Likewise.
	* libgcc/config/aarch64/t-stack-aarch64: Likewise.
	* libgcc/generic-morestack.c (__splitstack_find): Add aarch64-specific
	code.

gcc/ChangeLog:

	* common/config/aarch64/aarch64-common.c
	(aarch64_supports_split_stack): New function.
	(TARGET_SUPPORTS_SPLIT_STACK): New macro.
	* gcc/config/aarch64/aarch64-linux.h (TARGET_ASM_FILE_END): Remove
	macro.
	* gcc/config/aarch64/aarch64-protos.h: Add
	aarch64_expand_split_stack_prologue and
	aarch64_split_stack_space_check.
	* gcc/config/aarch64/aarch64.c (aarch64_expand_prologue): Setup the
	argument pointer (x10) for split-stack.
	(aarch64_expand_builtin_va_start): Use internal argument pointer
	instead of virtual_incoming_args_rtx.
	(morestack_ref): New symbol.
	(aarch64_expand_split_stack_prologue): New function.
	(aarch64_file_end): Emit the split-stack note sections.
	(aarch64_internal_arg_pointer): Likewise.
	(aarch64_live_on_entry): Set the argument pointer for split-stack.
	(aarch64_split_stack_space_check): Likewise.
	(TARGET_ASM_FILE_END): New macro.
	(TARGET_EXTRA_LIVE_ON_ENTRY): Likewise.
	(TARGET_INTERNAL_ARG_POINTER): Likewise.
	* gcc/config/aarch64/aarch64.h (aarch64_frame): Add
	split_stack_arg_pointer to setup the argument pointer when using
	split-stack.
	* gcc/config/aarch64/aarch64.md (UNSPEC_STACK_CHECK): New unspec.
	(UNSPECV_SPLIT_STACK_CALL): Likewise.
	(split_stack_prologue): New expand.
	(split_stack_space_check): Likewise.
	(split_stack_cond_call): New expand.
---
 gcc/common/config/aarch64/aarch64-common.c |  16 +-
 gcc/config/aarch64/aarch64-linux.h         |   2 -
 gcc/config/aarch64/aarch64-protos.h        |   2 +
 gcc/config/aarch64/aarch64.c               | 171 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.h               |   3 +
 gcc/config/aarch64/aarch64.md              |  57 +++++++
 libgcc/config.host                         |   1 +
 libgcc/config/aarch64/morestack-c.c        |  95 +++++++++++
 libgcc/config/aarch64/morestack.S          | 254 +++++++++++++++++++++++++++++
 libgcc/config/aarch64/t-stack-aarch64      |   3 +
 libgcc/generic-morestack.c                 |   1 +
 11 files changed, 600 insertions(+), 5 deletions(-)
 create mode 100644 libgcc/config/aarch64/morestack-c.c
 create mode 100644 libgcc/config/aarch64/morestack.S
 create mode 100644 libgcc/config/aarch64/t-stack-aarch64

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index 90f5f6b..7641279 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -107,6 +107,21 @@  aarch64_handle_option (struct gcc_options *opts,
     }
 }
 
+/* -fsplit-stack uses a TCB field available on glibc-2.25.  GLIBC also
+   exports symbol, __tcb_private_ss, to signal it has the field available
+   on TCB allocation.  This aims to prevent binaries linked against newer
+   GLIBC to run on non-supported ones.  */
+
+static bool
+aarch64_supports_split_stack (bool report ATTRIBUTE_UNUSED,
+			      struct gcc_options *opts ATTRIBUTE_UNUSED)
+{
+  return true;
+}
+
+#undef TARGET_SUPPORTS_SPLIT_STACK
+#define TARGET_SUPPORTS_SPLIT_STACK aarch64_supports_split_stack
+
 struct gcc_targetm_common targetm_common = TARGETM_COMMON_INITIALIZER;
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -340,4 +355,3 @@  aarch64_rewrite_mcpu (int argc, const char **argv)
 }
 
 #undef AARCH64_CPU_NAME_LENGTH
-
diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
index 5fcaa59..ab3208b 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -80,8 +80,6 @@ 
     }						\
   while (0)
 
-#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
-
 /* Uninitialized common symbols in non-PIE executables, even with
    strong definitions in dependent shared libraries, will resolve
    to COPY relocated symbol in the executable.  See PR65780.  */
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 7f67f14..10bb045 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -377,6 +377,8 @@  void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
 void aarch64_expand_prologue (void);
+void aarch64_expand_split_stack_prologue (void);
+void aarch64_split_stack_space_check (rtx, rtx);
 void aarch64_expand_vector_init (rtx, rtx);
 void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
 				   const_tree, unsigned);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1f35220..d2f809c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -895,7 +895,12 @@  aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
     output_asm_insn (buffer, operands);
 
-    snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
+    if (GET_CODE (operands[pos_label]) == LABEL_REF)
+      snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label,
+		label_ptr);
+    else
+      snprintf (buffer, sizeof (buffer), "b\t%%%d\n%s:", pos_label,
+		label_ptr);
     operands[pos_label] = dest_label;
     output_asm_insn (buffer, operands);
     return "";
@@ -3223,6 +3228,7 @@  aarch64_restore_callee_saves (machine_mode mode,
     }
 }
 
+
 /* AArch64 stack frames generated by this compiler look like:
 
 	+-------------------------------+
@@ -9831,7 +9837,7 @@  aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
   /* Emit code to initialize STACK, which points to the next varargs stack
      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
      by named arguments.  STACK is 8-byte aligned.  */
-  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
+  t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
   if (cum->aapcs_stack_size > 0)
     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
@@ -14265,6 +14271,161 @@  aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
     }
 }
 
+/* -fsplit-stack support.  */
+
+/* A SYMBOL_REF for __morestack.  */
+static GTY(()) rtx morestack_ref;
+
+/* Emit -fsplit-stack prologue, which goes before the regular function
+   prologue.  */
+
+void
+aarch64_expand_split_stack_prologue (void)
+{
+  rtx ssvalue, mem;
+  rtx reg10, reg11, reg12, cc, cmp, jump;
+  HOST_WIDE_INT allocate;
+  rtx_code_label *ok_label = NULL;
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  gcc_assert (flag_split_stack && reload_completed);
+
+  /* Limit the stack allocation to less than 4G so that its value always
+     fits the two instructions (mov/movk) emitted below.  The linker may
+     edit them to add extra space when split-stack code calls
+     non-split-stack functions.  */
+  allocate = cfun->machine->frame.frame_size;
+  if (allocate >= ((int64_t)1 << 32))
+    {
+      sorry ("Stack frame larger than 4G is not supported for -fsplit-stack");
+      return;
+    }
+
+  if (morestack_ref == NULL_RTX)
+    {
+      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
+					   | SYMBOL_FLAG_FUNCTION);
+    }
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* Always emit exactly two insns (mov + movk) to materialize the
+     allocation size as a positive value, so the linker can edit them
+     when adjusting the size for calls into non-split-stack code.  The
+     mov sets bits 0-15 and clears the rest, then the movk inserts bits
+     16-31; an insert at bit position 0 would instead overwrite the low
+     half set by the mov.  */
+  reg10 = gen_rtx_REG (Pmode, R10_REGNUM);
+  emit_insn (gen_rtx_SET (reg10, GEN_INT (allocate & 0xffff)));
+  emit_insn (gen_insv_immdi (reg10, GEN_INT (16),
+			     GEN_INT ((allocate & 0xffff0000) >> 16)));
+
+  /* x10 = sp - allocate, the lowest stack address the function needs;
+     it is compared against __private_ss below.  */
+  emit_insn (gen_sub3_insn (reg10, stack_pointer_rtx, reg10));
+
+  ok_label = gen_label_rtx ();
+
+  /* If function uses stacked arguments save the old stack value so morestack
+     can return it.  */
+  reg11 = gen_rtx_REG (Pmode, R11_REGNUM);
+  if (cfun->machine->frame.saved_regs_size
+      || cfun->machine->frame.saved_varargs_size)
+    emit_move_insn (reg11, stack_pointer_rtx);
+
+  /* x12 holds the continuation address used to return to function.  */
+  reg12 = gen_rtx_REG (Pmode, R12_REGNUM);
+  aarch64_expand_mov_immediate (reg12, gen_rtx_LABEL_REF (VOIDmode, ok_label));
+
+  /* Jump to __morestack if the current __private_ss is not sufficient.  */
+  cc = aarch64_gen_compare_reg (GE, ssvalue, reg10);
+  cmp = gen_rtx_fmt_ee (GE, VOIDmode, cc, const0_rtx);
+  jump = gen_split_stack_cond_call (morestack_ref, cmp, ok_label, reg12);
+
+  aarch64_emit_unlikely_jump (jump);
+  JUMP_LABEL (jump) = ok_label;
+  LABEL_NUSES (ok_label)++;
+
+  /* __morestack will call us here.  */
+  emit_label (ok_label);
+}
+
+/* Implement TARGET_ASM_FILE_END.  */
+
+static void
+aarch64_file_end (void)
+{
+  file_end_indicate_exec_stack ();
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
+}
+
+/* Implement TARGET_INTERNAL_ARG_POINTER.  Split-stack functions address
+   their incoming arguments through a pseudo initialized from x11.  */
+
+static rtx
+aarch64_internal_arg_pointer (void)
+{
+  /* Without -fsplit-stack, or for no_split_stack functions, keep the
+     default incoming argument pointer.  */
+  if (!flag_split_stack
+      || lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl)))
+    return virtual_incoming_args_rtx;
+
+  rtx arg_reg = cfun->machine->frame.split_stack_arg_pointer;
+  if (arg_reg == NULL_RTX)
+    {
+      arg_reg = gen_reg_rtx (Pmode);
+      REG_POINTER (arg_reg) = 1;
+      cfun->machine->frame.split_stack_arg_pointer = arg_reg;
+
+      /* Put the pseudo initialization right after the note at the
+	 beginning of the function.  */
+      rtx init = gen_rtx_SET (arg_reg, gen_rtx_REG (Pmode, R11_REGNUM));
+      push_topmost_sequence ();
+      emit_insn_after (init, get_insns ());
+      pop_topmost_sequence ();
+    }
+  return plus_constant (Pmode, arg_reg,
+			FIRST_PARM_OFFSET (current_function_decl));
+}
+
+/* Emit -fsplit-stack dynamic stack allocation space check: branch to LABEL
+   if SIZE more bytes are available on the current stack segment.  */
+
+void
+aarch64_split_stack_space_check (rtx size, rtx label)
+{
+  rtx mem, ssvalue, cc, cmp, jump;
+  rtx requested = gen_reg_rtx (Pmode);
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* Compute the stack pointer value after allocating SIZE bytes.  */
+  size = force_reg (Pmode, size);
+  emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));
+
+  /* Branch to LABEL when REQUESTED is still at or above __private_ss
+     (unsigned compare); otherwise fall through.  */
+  cc = aarch64_gen_compare_reg (GEU, requested, ssvalue);
+  cmp = gen_rtx_fmt_ee (GEU, VOIDmode, cc, const0_rtx);
+  jump = emit_jump_insn (gen_condjump (cmp, cc, label));
+  JUMP_LABEL (jump) = label;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -14291,6 +14452,9 @@  aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 #undef TARGET_ASM_FILE_START
 #define TARGET_ASM_FILE_START aarch64_start_file
 
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END aarch64_file_end
+
 #undef TARGET_ASM_OUTPUT_MI_THUNK
 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
 
@@ -14373,6 +14537,9 @@  aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
 
+#undef TARGET_INTERNAL_ARG_POINTER
+#define TARGET_INTERNAL_ARG_POINTER aarch64_internal_arg_pointer
+
 #undef TARGET_GIMPLE_FOLD_BUILTIN
 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 584ff5c..a464c66 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -586,6 +586,9 @@  struct GTY (()) aarch64_frame
   unsigned wb_candidate2;
 
   bool laid_out;
+
+  /* Alternative internal arg pointer for -fsplit-stack.  */
+  rtx split_stack_arg_pointer;
 };
 
 typedef struct GTY (()) machine_function
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 3d21232..2d2492b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -130,6 +130,7 @@ 
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_STACK_CHECK
     UNSPEC_RSQRT
     UNSPEC_RSQRTE
     UNSPEC_RSQRTS
@@ -144,6 +145,7 @@ 
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
     UNSPECV_BLOCKAGE		; Represent a blockage
     UNSPECV_PROBE_STACK_RANGE	; Represent stack range probing.
+    UNSPECV_SPLIT_STACK_CALL    ; Represent a morestack call
   ]
 )
 
@@ -5415,3 +5417,58 @@ 
 
 ;; ldp/stp peephole patterns
 (include "aarch64-ldpstp.md")
+
+;; Handle -fsplit-stack
+(define_expand "split_stack_prologue"
+  [(const_int 0)]
+  ""
+{
+  aarch64_expand_split_stack_prologue ();
+  DONE;
+})
+
+;; If there are operand 0 bytes available on the stack, jump to
+;; operand 1.
+(define_expand "split_stack_space_check"
+  [(set (match_dup 2) (compare:CC (match_dup 3) (match_dup 2)))
+   (set (pc) (if_then_else
+	      (geu (match_dup 4) (const_int 0))
+	      (label_ref (match_operand 1))
+	      (pc)))]
+  ""
+{
+  aarch64_split_stack_space_check (operands[0], operands[1]);
+  DONE;
+})
+
+;; A __morestack call using branch
+
+(define_expand "split_stack_cond_call"
+  [(match_operand 0 "aarch64_call_insn_operand" "")
+   (match_operand 1 "" "")
+   (match_operand 2 "" "")
+   (match_operand 3 "" "")]
+  ""
+{
+  emit_jump_insn (gen_split_stack_cond_call_di (operands[0], operands[1],
+						operands[2], operands[3]));
+  DONE;
+})
+
+
+(define_insn "split_stack_cond_call_<mode>"
+  [(set (pc)
+        (if_then_else
+          (match_operand 1 "aarch64_comparison_operator" "")
+          (label_ref (match_operand 2 "" ""))
+          (pc)))
+   (set (reg:P 1) (unspec_volatile:P [(match_operand:P 0 "aarch64_call_insn_operand" "")
+                                    (reg:P 1)]
+                                   UNSPECV_SPLIT_STACK_CALL))
+   (use (match_operand:P 3 "register_operand" ""))]
+  ""
+  {
+    return aarch64_gen_far_branch (operands, 0, "Lbcond", "b%M1\\t");
+  }
+  [(set_attr "type" "branch")]
+)
diff --git a/libgcc/config.host b/libgcc/config.host
index e7e5413..18d9498 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -341,6 +341,7 @@  aarch64*-*-linux*)
 	md_unwind_header=aarch64/linux-unwind.h
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+	tmake_file="${tmake_file} t-stack aarch64/t-stack-aarch64"
 	;;
 alpha*-*-linux*)
 	tmake_file="${tmake_file} alpha/t-alpha alpha/t-ieee t-crtfm alpha/t-linux"
diff --git a/libgcc/config/aarch64/morestack-c.c b/libgcc/config/aarch64/morestack-c.c
new file mode 100644
index 0000000..8df7895
--- /dev/null
+++ b/libgcc/config/aarch64/morestack-c.c
@@ -0,0 +1,95 @@ 
+/* AArch64 support for -fsplit-stack.
+ * Copyright (C) 2016 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef inhibit_libc
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include "generic-morestack.h"
+
+/* This is based on the GLIBC definition (version 2.24).  It need not be
+   kept in sync, since new fields are added at the end of the structure and
+   do not change the '__private_ss' layout.  */
+typedef struct
+{
+  void *dtv;
+  void *private;
+  void *__private_ss;
+} tcbhead_t;
+
+#define INITIAL_STACK_SIZE  0x4000
+#define BACKOFF             0x1000
+
+void __generic_morestack_set_initial_sp (void *sp, size_t len);
+void *__morestack_get_guard (void);
+void __morestack_set_guard (void *);
+void *__morestack_make_guard (void *stack, size_t size);
+void __morestack_load_mmap (void);
+
+/* We declare it as weak so it fails either at static linking or
+   at runtime if the GLIBC does not have the required TCB field.  */
+extern void __tcb_private_ss (void) __attribute__ ((weak));
+
+/* Initialize the stack guard when the program starts or when a new
+   thread starts.  This is called from a constructor using ctors section.  */
+void
+__stack_split_initialize (void)
+{
+  /* Faults here if the C library lacks the required TCB field.  */
+  __tcb_private_ss ();
+  register void* sp __asm__ ("sp");
+  tcbhead_t *tcb = ((tcbhead_t *) __builtin_thread_pointer ());
+  tcb->__private_ss = (void*)((uintptr_t)sp - INITIAL_STACK_SIZE);
+  __generic_morestack_set_initial_sp (sp, INITIAL_STACK_SIZE);
+}
+
+/* Read the guard held in the __private_ss field of the current TCB.  */
+void *
+__morestack_get_guard (void)
+{
+  const tcbhead_t *const head = __builtin_thread_pointer ();
+  return head->__private_ss;
+}
+
+/* Store PTR as the new guard in the __private_ss field of the TCB.  */
+void
+__morestack_set_guard (void *ptr)
+{
+  tcbhead_t *const head = __builtin_thread_pointer ();
+  head->__private_ss = ptr;
+}
+
+/* Compute the guard for a stack: STACK - SIZE, raised by BACKOFF.  */
+void *
+__morestack_make_guard (void *stack, size_t size)
+{
+  return (void *) ((uintptr_t) stack - size + BACKOFF);
+}
+
+/* Make __stack_split_initialize a high priority constructor.  */
+static void (*const ctors []) 
+  __attribute__ ((used, section (".ctors.65535"), aligned (sizeof (void *))))
+  = { __stack_split_initialize, __morestack_load_mmap };
+
+#endif /* !defined (inhibit_libc) */
diff --git a/libgcc/config/aarch64/morestack.S b/libgcc/config/aarch64/morestack.S
new file mode 100644
index 0000000..aac488d
--- /dev/null
+++ b/libgcc/config/aarch64/morestack.S
@@ -0,0 +1,254 @@ 
+# AArch64 support for -fsplit-stack.
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This file is part of GCC.
+
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+/* Define an entry point visible from C.  */
+#define ENTRY(name)						\
+  .globl name;							\
+  .type name,%function;						\
+  .align 4;							\
+  name##:
+
+#define END(name)						\
+  .size name,.-name
+
+/* __morestack frame size.  */
+#define MORESTACK_FRAMESIZE	112
+/* Offset from __morestack frame where the new stack size is saved and
+   passed to __generic_morestack.  */
+#define NEWSTACK_SAVE		96
+
+#define BACKOFF			0x1000
+# Large excess allocated when calling non-split-stack code.
+#define NON_SPLIT_STACK		0x100000
+
+# TCB offset of __private_ss
+#define TCB_PRIVATE_SS		#16
+
+	.text
+ENTRY(__morestack_non_split)
+	.cfi_startproc
+# x10 is the requested stack pointer, so lower it to ask for a large
+# excess (NON_SPLIT_STACK); __morestack computes the size as sp - x10.
+	sub	x10, x10, NON_SPLIT_STACK
+	.cfi_endproc
+END(__morestack_non_split)
+# Fall through into __morestack
+
+# This function is called with non-standard calling conventions, on entry
+# x10 is the requested stack pointer.  The split-stack prologue is in the
+# form:
+#
+# function:
+#	mrs    x9, tpidr_el0
+#	sub    x10, sp, N & 0xfff000
+#	sub    x10, x10, N & 0xfff
+#	ldr    x9, [x9, 16]
+#	adrp   x12, main_fn_entry
+#	add    x12, x12, :lo12:main_fn_entry
+#	[mov   x11, sp]                # If function has stacked arguments
+#	cmp    x9, x10
+#	b.lt   main_fn_entry
+#	b      __morestack
+# main_fn_entry: [x11 is argument pointer]
+#	[prolog]
+#
+# The normal function prologue follows here, with a small addition at the
+# end to set up the argument pointer if required (the prolog):
+#
+#       [...]                  # default function prologue
+#	b.lt   function:
+# function:
+#
+#
+# __morestack sets the condition flags (cmp x12, 0) before calling back so
+# the prologue addition can tell it ran and set up the argument pointer.
+
+ENTRY(__morestack)
+.LFB1:
+	.cfi_startproc
+
+#ifdef __PIC__
+	.cfi_personality 0x9b,DW.ref.__gcc_personality_v0
+	.cfi_lsda 0x1b,.LLSDA1
+#else
+	.cfi_personality 0x3,__gcc_personality_v0
+	.cfi_lsda 0x3,.LLSDA1
+#endif
+
+	# Calculate requested stack size.
+	sub	x10, sp, x10
+	# Set up the frame: x29/x30 at offset 0, CFA at x29 + frame size.
+	stp	x29, x30, [sp, -MORESTACK_FRAMESIZE]!
+	.cfi_def_cfa_offset MORESTACK_FRAMESIZE
+	.cfi_offset 29, -MORESTACK_FRAMESIZE
+	.cfi_offset 30, -MORESTACK_FRAMESIZE+8
+	add	x29, sp, 0
+	.cfi_def_cfa_register 29
+	# Save x0-x8 and the entry address (x12); pad the request by BACKOFF.
+	stp	x0, x1, [sp, 16]
+	stp	x2, x3, [sp, 32]
+	add	x10, x10, BACKOFF
+	stp	x4, x5, [sp, 48]
+	stp	x6, x7, [sp, 64]
+	stp	x8, x12, [sp, 80]
+	str	x10, [sp, NEWSTACK_SAVE]
+
+	# void __morestack_block_signals (void)
+	bl	__morestack_block_signals
+
+	# void *__generic_morestack (size_t *pframe_size,
+	#			     void *old_stack,
+	#			     size_t param_size)
+	# pframe_size: points at the size of the required stack frame (on return,
+	#	       the amount of space remaining on the allocated stack).
+	# old_stack: points at the parameters on the old stack
+	# param_size: size in bytes of parameters to copy to the new stack.
+	add	x0, x29, NEWSTACK_SAVE
+	add	x1, x29, MORESTACK_FRAMESIZE
+	mov	x2, 0
+	bl	__generic_morestack
+
+	# Start using new stack
+	mov	sp, x0
+
+	# Set __private_ss stack guard for the new stack.
+	ldr	x9, [x29, NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+.LEHB0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	# void __morestack_unblock_signals (void)
+	bl	__morestack_unblock_signals
+
+	# Set up for a call to the target function.
+	ldp	x0, x1, [x29, 16]
+	ldp	x2, x3, [x29, 32]
+	ldp	x4, x5, [x29, 48]
+	ldp	x6, x7, [x29, 64]
+	ldp	x8, x12, [x29, 80]
+	add	x11, x29, MORESTACK_FRAMESIZE
+	# Indicate __morestack was called.
+	cmp	x12, 0
+	blr	x12
+
+	stp	x0, x1, [x29, 16]
+	stp	x2, x3, [x29, 32]
+	stp	x4, x5, [x29, 48]
+	stp	x6, x7, [x29, 64]
+
+	bl	__morestack_block_signals
+
+	# void *__generic_releasestack (size_t *pavailable)
+	add	x0, x29, NEWSTACK_SAVE
+	bl	__generic_releasestack
+
+	# Reset __private_ss stack guard to value for old stack
+	ldr	x9, [x29, NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+
+	# Update TCB split stack field
+.LEHE0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	bl __morestack_unblock_signals
+
+	# Use old stack again; sp stays at the frame until it is reloaded.
+	mov	sp, x29
+
+	ldp	x0, x1, [sp, 16]
+	ldp	x2, x3, [sp, 32]
+	ldp	x4, x5, [sp, 48]
+	ldp	x6, x7, [sp, 64]
+	ldp	x29, x30, [sp], MORESTACK_FRAMESIZE
+
+	.cfi_remember_state
+	.cfi_restore 30
+	.cfi_restore 29
+	.cfi_def_cfa 31, 0
+
+	ret
+
+# This is the cleanup code called by the stack unwinder when
+# unwinding through code between .LEHB0 and .LEHE0 above.
+cleanup:
+	.cfi_restore_state
+	# Keep the frame above sp, then stash the exception header in it.
+	mov	sp, x29
+	str	x0, [x29, NEWSTACK_SAVE]
+	# size_t __generic_findstack (void *stack)
+	add	x0, x29, MORESTACK_FRAMESIZE
+	bl	__generic_findstack
+	sub	x0, x29, x0
+	add	x0, x0, BACKOFF
+	# Restore tcbhead_t.__private_ss
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+	ldr	x0, [x29, NEWSTACK_SAVE]
+	b	_Unwind_Resume
+        .cfi_endproc
+END(__morestack)
+
+	.section .gcc_except_table,"a",@progbits
+	.align 4
+.LLSDA1:
+	# @LPStart format (omit)
+        .byte   0xff
+	# @TType format (omit)
+        .byte   0xff
+	# Call-site format (uleb128)
+        .byte   0x1
+	# Call-site table length
+        .uleb128 .LLSDACSE1-.LLSDACSB1
+.LLSDACSB1:
+	# region 0 start
+        .uleb128 .LEHB0-.LFB1
+	# length
+        .uleb128 .LEHE0-.LEHB0
+	# landing pad
+        .uleb128 cleanup-.LFB1
+	# no action (ie a cleanup)
+        .uleb128 0
+.LLSDACSE1:
+
+
+	.global __gcc_personality_v0
+#ifdef __PIC__
+	# Build a position independent reference to the personality function.
+	.hidden DW.ref.__gcc_personality_v0
+	.weak   DW.ref.__gcc_personality_v0
+	.section .data.DW.ref.__gcc_personality_v0,"awG",@progbits,DW.ref.__gcc_personality_v0,comdat
+	.type   DW.ref.__gcc_personality_v0, @object
+	.align 3
+DW.ref.__gcc_personality_v0:
+	.size   DW.ref.__gcc_personality_v0, 8
+	.quad   __gcc_personality_v0
+#endif
+
+	.section .note.GNU-stack,"",@progbits
+	.section .note.GNU-split-stack,"",@progbits
+	.section .note.GNU-no-split-stack,"",@progbits
diff --git a/libgcc/config/aarch64/t-stack-aarch64 b/libgcc/config/aarch64/t-stack-aarch64
new file mode 100644
index 0000000..4babb4e
--- /dev/null
+++ b/libgcc/config/aarch64/t-stack-aarch64
@@ -0,0 +1,3 @@ 
+# Makefile fragment to support -fsplit-stack for aarch64.
+LIB2ADD_ST += $(srcdir)/config/aarch64/morestack.S \
+	      $(srcdir)/config/aarch64/morestack-c.c
diff --git a/libgcc/generic-morestack.c b/libgcc/generic-morestack.c
index b8eec4e..fe7092b 100644
--- a/libgcc/generic-morestack.c
+++ b/libgcc/generic-morestack.c
@@ -943,6 +943,7 @@  __splitstack_find (void *segment_arg, void *sp, size_t *len,
       nsp -= 2 * 160;
 #elif defined __s390__
       nsp -= 2 * 96;
+#elif defined __aarch64__
 #else
 #error "unrecognized target"
 #endif
-- 
2.7.4