diff mbox

move increase_alignment from simple to regular ipa pass

Message ID CAAgBjMndb7=-CM4Jodi6+BRPkSxj9=KyeRUGCG+vsahNZoUghQ@mail.gmail.com
State New
Headers show

Commit Message

Prathamesh Kulkarni June 23, 2016, 5:21 p.m. UTC
On 17 June 2016 at 19:52, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> On 14 June 2016 at 18:31, Prathamesh Kulkarni

> <prathamesh.kulkarni@linaro.org> wrote:

>> On 13 June 2016 at 16:13, Jan Hubicka <hubicka@ucw.cz> wrote:

>>>> diff --git a/gcc/cgraph.h b/gcc/cgraph.h

>>>> index ecafe63..41ac408 100644

>>>> --- a/gcc/cgraph.h

>>>> +++ b/gcc/cgraph.h

>>>> @@ -1874,6 +1874,9 @@ public:

>>>>       if we did not do any inter-procedural code movement.  */

>>>>    unsigned used_by_single_function : 1;

>>>>

>>>> +  /* Set if -fsection-anchors is set.  */

>>>> +  unsigned section_anchor : 1;

>>>> +

>>>>  private:

>>>>    /* Assemble thunks and aliases associated to varpool node.  */

>>>>    void assemble_aliases (void);

>>>> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c

>>>> index 4bfcad7..e75d5c0 100644

>>>> --- a/gcc/cgraphunit.c

>>>> +++ b/gcc/cgraphunit.c

>>>> @@ -800,6 +800,9 @@ varpool_node::finalize_decl (tree decl)

>>>>       it is available to notice_global_symbol.  */

>>>>    node->definition = true;

>>>>    notice_global_symbol (decl);

>>>> +

>>>> +  node->section_anchor = flag_section_anchors;

>>>> +

>>>>    if (TREE_THIS_VOLATILE (decl) || DECL_PRESERVE_P (decl)

>>>>        /* Traditionally we do not eliminate static variables when not

>>>>        optimizing and when not doing toplevel reoder.  */

>>>> diff --git a/gcc/common.opt b/gcc/common.opt

>>>> index f0d7196..e497795 100644

>>>> --- a/gcc/common.opt

>>>> +++ b/gcc/common.opt

>>>> @@ -1590,6 +1590,10 @@ fira-algorithm=

>>>>  Common Joined RejectNegative Enum(ira_algorithm) Var(flag_ira_algorithm) Init(IRA_ALGORITHM_CB) Optimization

>>>>  -fira-algorithm=[CB|priority] Set the used IRA algorithm.

>>>>

>>>> +fipa-increase_alignment

>>>> +Common Report Var(flag_ipa_increase_alignment) Init(0) Optimization

>>>> +Option to gate increase_alignment ipa pass.

>>>> +

>>>>  Enum

>>>>  Name(ira_algorithm) Type(enum ira_algorithm) UnknownError(unknown IRA algorithm %qs)

>>>>

>>>> @@ -2133,7 +2137,7 @@ Common Report Var(flag_sched_dep_count_heuristic) Init(1) Optimization

>>>>  Enable the dependent count heuristic in the scheduler.

>>>>

>>>>  fsection-anchors

>>>> -Common Report Var(flag_section_anchors) Optimization

>>>> +Common Report Var(flag_section_anchors)

>>>>  Access data in the same section from shared anchor points.

>>>>

>>>>  fsee

>>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

>>>> index a0db3a4..1482566 100644

>>>> --- a/gcc/config/aarch64/aarch64.c

>>>> +++ b/gcc/config/aarch64/aarch64.c

>>>> @@ -8252,6 +8252,8 @@ aarch64_override_options (void)

>>>>

>>>>    aarch64_register_fma_steering ();

>>>>

>>>> +  /* Enable increase_alignment pass.  */

>>>> +  flag_ipa_increase_alignment = 1;

>>>

>>> I would rather enable it always on targets that do support anchors.

>> AFAIK aarch64 supports section anchors.

>>>> diff --git a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c

>>>> index ce9e146..7f09f3a 100644

>>>> --- a/gcc/lto/lto-symtab.c

>>>> +++ b/gcc/lto/lto-symtab.c

>>>> @@ -342,6 +342,13 @@ lto_symtab_merge (symtab_node *prevailing, symtab_node *entry)

>>>>       The type compatibility checks or the completing of types has properly

>>>>       dealt with most issues.  */

>>>>

>>>> +  /* ??? is this assert necessary ?  */

>>>> +  varpool_node *v_prevailing = dyn_cast<varpool_node *> (prevailing);

>>>> +  varpool_node *v_entry = dyn_cast<varpool_node *> (entry);

>>>> +  gcc_assert (v_prevailing && v_entry);

>>>> +  /* section_anchor of prevailing_decl wins.  */

>>>> +  v_entry->section_anchor = v_prevailing->section_anchor;

>>>> +

>>> Other flags are merged in lto_varpool_replace_node so please move this there.

>> Ah indeed, thanks for the pointers.

>> I wonder though if we need to set

>> prevailing_node->section_anchor = vnode->section_anchor ?

>> IIUC, the function merges flags from vnode into prevailing_node

>> and removes vnode. However we want prevailing_node->section_anchor

>> to always take precedence.

>>>> +/* Return true if alignment should be increased for this vnode.

>>>> +   This is done if every function that references/referring to vnode

>>>> +   has flag_tree_loop_vectorize set.  */

>>>> +

>>>> +static bool

>>>> +increase_alignment_p (varpool_node *vnode)

>>>> +{

>>>> +  ipa_ref *ref;

>>>> +

>>>> +  for (int i = 0; vnode->iterate_reference (i, ref); i++)

>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referred))

>>>> +      {

>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>> +       return false;

>>>> +      }

>>>

>>> If you take address of function that has vectorizer enabled probably doesn't

>>> imply need to increase alignment of that var. So please drop the loop.

>>>

>>> You only want function that read/writes or takes address of the symbol. But

>>> onthe other hand, you need to walk all aliases of the symbol by

>>> call_for_symbol_and_aliases

>>>> +

>>>> +  for (int i = 0; vnode->iterate_referring (i, ref); i++)

>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referring))

>>>> +      {

>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>> +       return false;

>>>> +      }

>>>> +

>>>> +  return true;

>>>> +}

>>>> +

>>>>  /* Entry point to increase_alignment pass.  */

>>>>  static unsigned int

>>>>  increase_alignment (void)

>>>> @@ -914,9 +942,12 @@ increase_alignment (void)

>>>>        tree decl = vnode->decl;

>>>>        unsigned int alignment;

>>>>

>>>> -      if ((decl_in_symtab_p (decl)

>>>> -       && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>> -       || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))

>>>> +      if (!vnode->section_anchor

>>>> +       || (decl_in_symtab_p (decl)

>>>> +           && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>> +       || DECL_USER_ALIGN (decl)

>>>> +       || DECL_ARTIFICIAL (decl)

>>>> +       || !increase_alignment_p (vnode))

>>>

>>> Incrementally we probably should do more testing whether the variable looks like

>>> something that can be vectorized, i.e. it contains array, has address taken or the

>>> accesses are array accesses within loop.

>>> This can be done by the analysis phase of the IPA pass inspecting the function

>>> bodies.

>> Thanks, I will try to check for array accesses are within a loop in

>> followup patch.

>> I was wondering if we could we treat a homogeneous global struct

>> (having members of one type),

>> as a global array of that type and increase its alignment if required ?

>>>

>>> I think it is an important waste to bump up everything including error messages etc.

>>> At least on i386 the effect on firefox datasegment of various alignment setting is

>>> very visible.

>> Um for a start, would it be OK to check if all functions referencing variable

>> have attribute noreturn, and in that case we skip increasing the alignment ?

>> I suppose that error functions would be having attribute noreturn set ?

>>>

>>> Looks OK to me otherwise. please send updated patch.

>> I have done the changes in the attached patch (stage-1 built).

>> I am not sure what to return from the callback function and

>> arbitrarily chose to return true.

> Hi,

> In this version (stage-1 built), I added read/write summaries which

> stream those variables

> which we want to increase alignment for.

>

> I have a few questions:

>

> a) To check if global var is used inside a loop, I obtained

> corresponding ipa_ref

> and checked loop_containing_stmt (ref->stmt), however it returned non-NULL

> even when ref->stmt was not inside a loop.

> for instance:

> int a[32];

> int f() { int x = a[0]; return x; }

> Um how to check if stmt is used inside a loop ?

>

> b) Is it necessary during WPA to check if function has

> flag_tree_vectorize_set since

> during analysis phase in increase_alignment_generate_summary() I check

> if cnode has flag_tree_loop_vectorize_set ?

>

> c) In LTO_increase_alignment_section, the following is streamed:

> i) a "count" of type uwhi, to represent number of variables

> ii) decls corresponding to the variables

> The variables are then obtained during read_summary using

> symtab_node::get_create().

> I suppose since decls for varpool_nodes would already be streamed in

> LTO_section_decls, I was wondering if I

> could somehow refer to the decls in that section to avoid duplicate

> streaming of decls ?

Hi,
In this version, the variable is streamed if it occurs within a loop
or its address is taken.
To check if stmt is inside a loop I am using:

struct loop *l = loop_containing_stmt (ref->stmt);
if (l != DECL_STRUCT_FUNCTION (cnode->decl)->x_current_loops->tree_root)
  vars->add (vnode);
Is this correct ?

I came across an unexpected issue in my previous patch with -ffat-lto-objects.
I am allocating vars = new hash_set<varpool_node *> () in
generate_summary() and freeing it in write_summary().
However with -ffat-lto-objects, it appears execute() gets called after
write_summary()
during LGEN and we hit segfault in execute() at:
for (hash_set<varpool_node *>::iterator it = vars.begin (); it !=
vars.end (); it++)
  { ... }
because write_summary() has freed vars.
To workaround the issue, I gated call to free vars in write_summary on
flag_fat_lto_objects,
I am not sure if that's a good solution.

Cross tested on arm*-*-*, aarch64*-*-*.

Thanks,
Prathamesh
>

> Thanks,

> Prathamesh

>>

>> Thanks,

>> Prathamesh

>>>

>>> Honza

Comments

Prathamesh Kulkarni June 28, 2016, 9:19 a.m. UTC | #1
ping https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01703.html

Thanks,
Prathamesh

On 23 June 2016 at 22:51, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> On 17 June 2016 at 19:52, Prathamesh Kulkarni

> <prathamesh.kulkarni@linaro.org> wrote:

>> On 14 June 2016 at 18:31, Prathamesh Kulkarni

>> <prathamesh.kulkarni@linaro.org> wrote:

>>> On 13 June 2016 at 16:13, Jan Hubicka <hubicka@ucw.cz> wrote:

>>>>> diff --git a/gcc/cgraph.h b/gcc/cgraph.h

>>>>> index ecafe63..41ac408 100644

>>>>> --- a/gcc/cgraph.h

>>>>> +++ b/gcc/cgraph.h

>>>>> @@ -1874,6 +1874,9 @@ public:

>>>>>       if we did not do any inter-procedural code movement.  */

>>>>>    unsigned used_by_single_function : 1;

>>>>>

>>>>> +  /* Set if -fsection-anchors is set.  */

>>>>> +  unsigned section_anchor : 1;

>>>>> +

>>>>>  private:

>>>>>    /* Assemble thunks and aliases associated to varpool node.  */

>>>>>    void assemble_aliases (void);

>>>>> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c

>>>>> index 4bfcad7..e75d5c0 100644

>>>>> --- a/gcc/cgraphunit.c

>>>>> +++ b/gcc/cgraphunit.c

>>>>> @@ -800,6 +800,9 @@ varpool_node::finalize_decl (tree decl)

>>>>>       it is available to notice_global_symbol.  */

>>>>>    node->definition = true;

>>>>>    notice_global_symbol (decl);

>>>>> +

>>>>> +  node->section_anchor = flag_section_anchors;

>>>>> +

>>>>>    if (TREE_THIS_VOLATILE (decl) || DECL_PRESERVE_P (decl)

>>>>>        /* Traditionally we do not eliminate static variables when not

>>>>>        optimizing and when not doing toplevel reoder.  */

>>>>> diff --git a/gcc/common.opt b/gcc/common.opt

>>>>> index f0d7196..e497795 100644

>>>>> --- a/gcc/common.opt

>>>>> +++ b/gcc/common.opt

>>>>> @@ -1590,6 +1590,10 @@ fira-algorithm=

>>>>>  Common Joined RejectNegative Enum(ira_algorithm) Var(flag_ira_algorithm) Init(IRA_ALGORITHM_CB) Optimization

>>>>>  -fira-algorithm=[CB|priority] Set the used IRA algorithm.

>>>>>

>>>>> +fipa-increase_alignment

>>>>> +Common Report Var(flag_ipa_increase_alignment) Init(0) Optimization

>>>>> +Option to gate increase_alignment ipa pass.

>>>>> +

>>>>>  Enum

>>>>>  Name(ira_algorithm) Type(enum ira_algorithm) UnknownError(unknown IRA algorithm %qs)

>>>>>

>>>>> @@ -2133,7 +2137,7 @@ Common Report Var(flag_sched_dep_count_heuristic) Init(1) Optimization

>>>>>  Enable the dependent count heuristic in the scheduler.

>>>>>

>>>>>  fsection-anchors

>>>>> -Common Report Var(flag_section_anchors) Optimization

>>>>> +Common Report Var(flag_section_anchors)

>>>>>  Access data in the same section from shared anchor points.

>>>>>

>>>>>  fsee

>>>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

>>>>> index a0db3a4..1482566 100644

>>>>> --- a/gcc/config/aarch64/aarch64.c

>>>>> +++ b/gcc/config/aarch64/aarch64.c

>>>>> @@ -8252,6 +8252,8 @@ aarch64_override_options (void)

>>>>>

>>>>>    aarch64_register_fma_steering ();

>>>>>

>>>>> +  /* Enable increase_alignment pass.  */

>>>>> +  flag_ipa_increase_alignment = 1;

>>>>

>>>> I would rather enable it always on targets that do support anchors.

>>> AFAIK aarch64 supports section anchors.

>>>>> diff --git a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c

>>>>> index ce9e146..7f09f3a 100644

>>>>> --- a/gcc/lto/lto-symtab.c

>>>>> +++ b/gcc/lto/lto-symtab.c

>>>>> @@ -342,6 +342,13 @@ lto_symtab_merge (symtab_node *prevailing, symtab_node *entry)

>>>>>       The type compatibility checks or the completing of types has properly

>>>>>       dealt with most issues.  */

>>>>>

>>>>> +  /* ??? is this assert necessary ?  */

>>>>> +  varpool_node *v_prevailing = dyn_cast<varpool_node *> (prevailing);

>>>>> +  varpool_node *v_entry = dyn_cast<varpool_node *> (entry);

>>>>> +  gcc_assert (v_prevailing && v_entry);

>>>>> +  /* section_anchor of prevailing_decl wins.  */

>>>>> +  v_entry->section_anchor = v_prevailing->section_anchor;

>>>>> +

>>>> Other flags are merged in lto_varpool_replace_node so please move this there.

>>> Ah indeed, thanks for the pointers.

>>> I wonder though if we need to set

>>> prevailing_node->section_anchor = vnode->section_anchor ?

>>> IIUC, the function merges flags from vnode into prevailing_node

>>> and removes vnode. However we want prevailing_node->section_anchor

>>> to always take precedence.

>>>>> +/* Return true if alignment should be increased for this vnode.

>>>>> +   This is done if every function that references/referring to vnode

>>>>> +   has flag_tree_loop_vectorize set.  */

>>>>> +

>>>>> +static bool

>>>>> +increase_alignment_p (varpool_node *vnode)

>>>>> +{

>>>>> +  ipa_ref *ref;

>>>>> +

>>>>> +  for (int i = 0; vnode->iterate_reference (i, ref); i++)

>>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referred))

>>>>> +      {

>>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>>> +       return false;

>>>>> +      }

>>>>

>>>> If you take address of function that has vectorizer enabled probably doesn't

>>>> imply need to increase alignment of that var. So please drop the loop.

>>>>

>>>> You only want function that read/writes or takes address of the symbol. But

>>>> on the other hand, you need to walk all aliases of the symbol by

>>>> call_for_symbol_and_aliases

>>>>> +

>>>>> +  for (int i = 0; vnode->iterate_referring (i, ref); i++)

>>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referring))

>>>>> +      {

>>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>>> +       return false;

>>>>> +      }

>>>>> +

>>>>> +  return true;

>>>>> +}

>>>>> +

>>>>>  /* Entry point to increase_alignment pass.  */

>>>>>  static unsigned int

>>>>>  increase_alignment (void)

>>>>> @@ -914,9 +942,12 @@ increase_alignment (void)

>>>>>        tree decl = vnode->decl;

>>>>>        unsigned int alignment;

>>>>>

>>>>> -      if ((decl_in_symtab_p (decl)

>>>>> -       && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>>> -       || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))

>>>>> +      if (!vnode->section_anchor

>>>>> +       || (decl_in_symtab_p (decl)

>>>>> +           && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>>> +       || DECL_USER_ALIGN (decl)

>>>>> +       || DECL_ARTIFICIAL (decl)

>>>>> +       || !increase_alignment_p (vnode))

>>>>

>>>> Incrementally we probably should do more testing whether the variable looks like

>>>> something that can be vectorized, i.e. it contains array, has address taken or the

>>>> accesses are array accesses within loop.

>>>> This can be done by the analysis phase of the IPA pass inspecting the function

>>>> bodies.

>>> Thanks, I will try to check for array accesses are within a loop in

>>> followup patch.

>>> I was wondering if we could we treat a homogeneous global struct

>>> (having members of one type),

>>> as a global array of that type and increase its alignment if required ?

>>>>

>>>> I think it is an important waste to bump up everything including error messages etc.

>>>> At least on i386 the effect on firefox datasegment of various alignment setting is

>>>> very visible.

>>> Um for a start, would it be OK to check if all functions referencing variable

>>> have attribute noreturn, and in that case we skip increasing the alignment ?

>>> I suppose that error functions would be having attribute noreturn set ?

>>>>

>>>> Looks OK to me otherwise. please send updated patch.

>>> I have done the changes in the attached patch (stage-1 built).

>>> I am not sure what to return from the callback function and

>>> arbitrarily chose to return true.

>> Hi,

>> In this version (stage-1 built), I added read/write summaries which

>> stream those variables

>> which we want to increase alignment for.

>>

>> I have a few questions:

>>

>> a) To check if global var is used inside a loop, I obtained

>> corresponding ipa_ref

>> and checked loop_containing_stmt (ref->stmt), however it returned non-NULL

>> even when ref->stmt was not inside a loop.

>> for instance:

>> int a[32];

>> int f() { int x = a[0]; return x; }

>> Um how to check if stmt is used inside a loop ?

>>

>> b) Is it necessary during WPA to check if function has

>> flag_tree_vectorize_set since

>> during analysis phase in increase_alignment_generate_summary() I check

>> if cnode has flag_tree_loop_vectorize_set ?

>>

>> c) In LTO_increase_alignment_section, the following is streamed:

>> i) a "count" of type uwhi, to represent number of variables

>> ii) decls corresponding to the variables

>> The variables are then obtained during read_summary using

>> symtab_node::get_create().

>> I suppose since decls for varpool_nodes would already be streamed in

>> LTO_section_decls, I was wondering if I

>> could somehow refer to the decls in that section to avoid duplicate

>> streaming of decls ?

> Hi,

> In this version, the variable is streamed if it occurs within a loop

> or its address is taken.

> To check if stmt is inside a loop I am using:

>

> struct loop *l = loop_containing_stmt (ref->stmt);

> if (l != DECL_STRUCT_FUNCTION (cnode->decl)->x_current_loops->tree_root)

>   vars->add (vnode);

> Is this correct ?

>

> I came across an unexpected issue in my previous patch with -ffat-lto-objects.

> I am allocating vars = new hash_set<varpool_node *> () in

> generate_summary() and freeing it in write_summary().

> However with -ffat-lto-objects, it appears execute() gets called after

> write_summary()

> during LGEN and we hit segfault in execute() at:

> for (hash_set<varpool_node *>::iterator it = vars.begin (); it !=

> vars.end (); it++)

>   { ... }

> because write_summary() has freed vars.

> To workaround the issue, I gated call to free vars in write_summary on

> flag_fat_lto_objects,

> I am not sure if that's a good solution.

>

> Cross tested on arm*-*-*, aarch64*-*-*.

>

> Thanks,

> Prathamesh

>>

>> Thanks,

>> Prathamesh

>>>

>>> Thanks,

>>> Prathamesh

>>>>

>>>> Honza
Prathamesh Kulkarni July 5, 2016, 9:53 a.m. UTC | #2
ping * 2 https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01703.html

Thanks,
Prathamesh

On 28 June 2016 at 14:49, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> ping https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01703.html

>

> Thanks,

> Prathamesh

>

> On 23 June 2016 at 22:51, Prathamesh Kulkarni

> <prathamesh.kulkarni@linaro.org> wrote:

>> On 17 June 2016 at 19:52, Prathamesh Kulkarni

>> <prathamesh.kulkarni@linaro.org> wrote:

>>> On 14 June 2016 at 18:31, Prathamesh Kulkarni

>>> <prathamesh.kulkarni@linaro.org> wrote:

>>>> On 13 June 2016 at 16:13, Jan Hubicka <hubicka@ucw.cz> wrote:

>>>>>> diff --git a/gcc/cgraph.h b/gcc/cgraph.h

>>>>>> index ecafe63..41ac408 100644

>>>>>> --- a/gcc/cgraph.h

>>>>>> +++ b/gcc/cgraph.h

>>>>>> @@ -1874,6 +1874,9 @@ public:

>>>>>>       if we did not do any inter-procedural code movement.  */

>>>>>>    unsigned used_by_single_function : 1;

>>>>>>

>>>>>> +  /* Set if -fsection-anchors is set.  */

>>>>>> +  unsigned section_anchor : 1;

>>>>>> +

>>>>>>  private:

>>>>>>    /* Assemble thunks and aliases associated to varpool node.  */

>>>>>>    void assemble_aliases (void);

>>>>>> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c

>>>>>> index 4bfcad7..e75d5c0 100644

>>>>>> --- a/gcc/cgraphunit.c

>>>>>> +++ b/gcc/cgraphunit.c

>>>>>> @@ -800,6 +800,9 @@ varpool_node::finalize_decl (tree decl)

>>>>>>       it is available to notice_global_symbol.  */

>>>>>>    node->definition = true;

>>>>>>    notice_global_symbol (decl);

>>>>>> +

>>>>>> +  node->section_anchor = flag_section_anchors;

>>>>>> +

>>>>>>    if (TREE_THIS_VOLATILE (decl) || DECL_PRESERVE_P (decl)

>>>>>>        /* Traditionally we do not eliminate static variables when not

>>>>>>        optimizing and when not doing toplevel reoder.  */

>>>>>> diff --git a/gcc/common.opt b/gcc/common.opt

>>>>>> index f0d7196..e497795 100644

>>>>>> --- a/gcc/common.opt

>>>>>> +++ b/gcc/common.opt

>>>>>> @@ -1590,6 +1590,10 @@ fira-algorithm=

>>>>>>  Common Joined RejectNegative Enum(ira_algorithm) Var(flag_ira_algorithm) Init(IRA_ALGORITHM_CB) Optimization

>>>>>>  -fira-algorithm=[CB|priority] Set the used IRA algorithm.

>>>>>>

>>>>>> +fipa-increase_alignment

>>>>>> +Common Report Var(flag_ipa_increase_alignment) Init(0) Optimization

>>>>>> +Option to gate increase_alignment ipa pass.

>>>>>> +

>>>>>>  Enum

>>>>>>  Name(ira_algorithm) Type(enum ira_algorithm) UnknownError(unknown IRA algorithm %qs)

>>>>>>

>>>>>> @@ -2133,7 +2137,7 @@ Common Report Var(flag_sched_dep_count_heuristic) Init(1) Optimization

>>>>>>  Enable the dependent count heuristic in the scheduler.

>>>>>>

>>>>>>  fsection-anchors

>>>>>> -Common Report Var(flag_section_anchors) Optimization

>>>>>> +Common Report Var(flag_section_anchors)

>>>>>>  Access data in the same section from shared anchor points.

>>>>>>

>>>>>>  fsee

>>>>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

>>>>>> index a0db3a4..1482566 100644

>>>>>> --- a/gcc/config/aarch64/aarch64.c

>>>>>> +++ b/gcc/config/aarch64/aarch64.c

>>>>>> @@ -8252,6 +8252,8 @@ aarch64_override_options (void)

>>>>>>

>>>>>>    aarch64_register_fma_steering ();

>>>>>>

>>>>>> +  /* Enable increase_alignment pass.  */

>>>>>> +  flag_ipa_increase_alignment = 1;

>>>>>

>>>>> I would rather enable it always on targets that do support anchors.

>>>> AFAIK aarch64 supports section anchors.

>>>>>> diff --git a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c

>>>>>> index ce9e146..7f09f3a 100644

>>>>>> --- a/gcc/lto/lto-symtab.c

>>>>>> +++ b/gcc/lto/lto-symtab.c

>>>>>> @@ -342,6 +342,13 @@ lto_symtab_merge (symtab_node *prevailing, symtab_node *entry)

>>>>>>       The type compatibility checks or the completing of types has properly

>>>>>>       dealt with most issues.  */

>>>>>>

>>>>>> +  /* ??? is this assert necessary ?  */

>>>>>> +  varpool_node *v_prevailing = dyn_cast<varpool_node *> (prevailing);

>>>>>> +  varpool_node *v_entry = dyn_cast<varpool_node *> (entry);

>>>>>> +  gcc_assert (v_prevailing && v_entry);

>>>>>> +  /* section_anchor of prevailing_decl wins.  */

>>>>>> +  v_entry->section_anchor = v_prevailing->section_anchor;

>>>>>> +

>>>>> Other flags are merged in lto_varpool_replace_node so please move this there.

>>>> Ah indeed, thanks for the pointers.

>>>> I wonder though if we need to set

>>>> prevailing_node->section_anchor = vnode->section_anchor ?

>>>> IIUC, the function merges flags from vnode into prevailing_node

>>>> and removes vnode. However we want prevailing_node->section_anchor

>>>> to always take precedence.

>>>>>> +/* Return true if alignment should be increased for this vnode.

>>>>>> +   This is done if every function that references/referring to vnode

>>>>>> +   has flag_tree_loop_vectorize set.  */

>>>>>> +

>>>>>> +static bool

>>>>>> +increase_alignment_p (varpool_node *vnode)

>>>>>> +{

>>>>>> +  ipa_ref *ref;

>>>>>> +

>>>>>> +  for (int i = 0; vnode->iterate_reference (i, ref); i++)

>>>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referred))

>>>>>> +      {

>>>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>>>> +       return false;

>>>>>> +      }

>>>>>

>>>>> If you take address of function that has vectorizer enabled probably doesn't

>>>>> imply need to increase alignment of that var. So please drop the loop.

>>>>>

>>>>> You only want function that read/writes or takes address of the symbol. But

>>>>> on the other hand, you need to walk all aliases of the symbol by

>>>>> call_for_symbol_and_aliases

>>>>>> +

>>>>>> +  for (int i = 0; vnode->iterate_referring (i, ref); i++)

>>>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referring))

>>>>>> +      {

>>>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>>>> +       return false;

>>>>>> +      }

>>>>>> +

>>>>>> +  return true;

>>>>>> +}

>>>>>> +

>>>>>>  /* Entry point to increase_alignment pass.  */

>>>>>>  static unsigned int

>>>>>>  increase_alignment (void)

>>>>>> @@ -914,9 +942,12 @@ increase_alignment (void)

>>>>>>        tree decl = vnode->decl;

>>>>>>        unsigned int alignment;

>>>>>>

>>>>>> -      if ((decl_in_symtab_p (decl)

>>>>>> -       && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>>>> -       || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))

>>>>>> +      if (!vnode->section_anchor

>>>>>> +       || (decl_in_symtab_p (decl)

>>>>>> +           && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>>>> +       || DECL_USER_ALIGN (decl)

>>>>>> +       || DECL_ARTIFICIAL (decl)

>>>>>> +       || !increase_alignment_p (vnode))

>>>>>

>>>>> Incrementally we probably should do more testing whether the variable looks like

>>>>> something that can be vectorized, i.e. it contains array, has address taken or the

>>>>> accesses are array accesses within loop.

>>>>> This can be done by the analysis phase of the IPA pass inspecting the function

>>>>> bodies.

>>>> Thanks, I will try to check for array accesses are within a loop in

>>>> followup patch.

>>>> I was wondering if we could we treat a homogeneous global struct

>>>> (having members of one type),

>>>> as a global array of that type and increase its alignment if required ?

>>>>>

>>>>> I think it is an important waste to bump up everything including error messages etc.

>>>>> At least on i386 the effect on firefox datasegment of various alignment setting is

>>>>> very visible.

>>>> Um for a start, would it be OK to check if all functions referencing variable

>>>> have attribute noreturn, and in that case we skip increasing the alignment ?

>>>> I suppose that error functions would be having attribute noreturn set ?

>>>>>

>>>>> Looks OK to me otherwise. please send updated patch.

>>>> I have done the changes in the attached patch (stage-1 built).

>>>> I am not sure what to return from the callback function and

>>>> arbitrarily chose to return true.

>>> Hi,

>>> In this version (stage-1 built), I added read/write summaries which

>>> stream those variables

>>> which we want to increase alignment for.

>>>

>>> I have a few questions:

>>>

>>> a) To check if global var is used inside a loop, I obtained

>>> corresponding ipa_ref

>>> and checked loop_containing_stmt (ref->stmt), however it returned non-NULL

>>> even when ref->stmt was not inside a loop.

>>> for instance:

>>> int a[32];

>>> int f() { int x = a[0]; return x; }

>>> Um how to check if stmt is used inside a loop ?

>>>

>>> b) Is it necessary during WPA to check if function has

>>> flag_tree_vectorize_set since

>>> during analysis phase in increase_alignment_generate_summary() I check

>>> if cnode has flag_tree_loop_vectorize_set ?

>>>

>>> c) In LTO_increase_alignment_section, the following is streamed:

>>> i) a "count" of type uwhi, to represent number of variables

>>> ii) decls corresponding to the variables

>>> The variables are then obtained during read_summary using

>>> symtab_node::get_create().

>>> I suppose since decls for varpool_nodes would already be streamed in

>>> LTO_section_decls, I was wondering if I

>>> could somehow refer to the decls in that section to avoid duplicate

>>> streaming of decls ?

>> Hi,

>> In this version, the variable is streamed if it occurs within a loop

>> or its address is taken.

>> To check if stmt is inside a loop I am using:

>>

>> struct loop *l = loop_containing_stmt (ref->stmt);

>> if (l != DECL_STRUCT_FUNCTION (cnode->decl)->x_current_loops->tree_root)

>>   vars->add (vnode);

>> Is this correct ?

>>

>> I came across an unexpected issue in my previous patch with -ffat-lto-objects.

>> I am allocating vars = new hash_set<varpool_node *> () in

>> generate_summary() and freeing it in write_summary().

>> However with -ffat-lto-objects, it appears execute() gets called after

>> write_summary()

>> during LGEN and we hit segfault in execute() at:

>> for (hash_set<varpool_node *>::iterator it = vars.begin (); it !=

>> vars.end (); it++)

>>   { ... }

>> because write_summary() has freed vars.

>> To workaround the issue, I gated call to free vars in write_summary on

>> flag_fat_lto_objects,

>> I am not sure if that's a good solution.

>>

>> Cross tested on arm*-*-*, aarch64*-*-*.

>>

>> Thanks,

>> Prathamesh

>>>

>>> Thanks,

>>> Prathamesh

>>>>

>>>> Thanks,

>>>> Prathamesh

>>>>>

>>>>> Honza
Prathamesh Kulkarni July 20, 2016, 11:49 a.m. UTC | #3
ping * 3 https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01703.html

Thanks,
Prathamesh

On 5 July 2016 at 10:53, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> ping * 2 https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01703.html

>

> Thanks,

> Prathamesh

>

> On 28 June 2016 at 14:49, Prathamesh Kulkarni

> <prathamesh.kulkarni@linaro.org> wrote:

>> ping https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01703.html

>>

>> Thanks,

>> Prathamesh

>>

>> On 23 June 2016 at 22:51, Prathamesh Kulkarni

>> <prathamesh.kulkarni@linaro.org> wrote:

>>> On 17 June 2016 at 19:52, Prathamesh Kulkarni

>>> <prathamesh.kulkarni@linaro.org> wrote:

>>>> On 14 June 2016 at 18:31, Prathamesh Kulkarni

>>>> <prathamesh.kulkarni@linaro.org> wrote:

>>>>> On 13 June 2016 at 16:13, Jan Hubicka <hubicka@ucw.cz> wrote:

>>>>>>> diff --git a/gcc/cgraph.h b/gcc/cgraph.h

>>>>>>> index ecafe63..41ac408 100644

>>>>>>> --- a/gcc/cgraph.h

>>>>>>> +++ b/gcc/cgraph.h

>>>>>>> @@ -1874,6 +1874,9 @@ public:

>>>>>>>       if we did not do any inter-procedural code movement.  */

>>>>>>>    unsigned used_by_single_function : 1;

>>>>>>>

>>>>>>> +  /* Set if -fsection-anchors is set.  */

>>>>>>> +  unsigned section_anchor : 1;

>>>>>>> +

>>>>>>>  private:

>>>>>>>    /* Assemble thunks and aliases associated to varpool node.  */

>>>>>>>    void assemble_aliases (void);

>>>>>>> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c

>>>>>>> index 4bfcad7..e75d5c0 100644

>>>>>>> --- a/gcc/cgraphunit.c

>>>>>>> +++ b/gcc/cgraphunit.c

>>>>>>> @@ -800,6 +800,9 @@ varpool_node::finalize_decl (tree decl)

>>>>>>>       it is available to notice_global_symbol.  */

>>>>>>>    node->definition = true;

>>>>>>>    notice_global_symbol (decl);

>>>>>>> +

>>>>>>> +  node->section_anchor = flag_section_anchors;

>>>>>>> +

>>>>>>>    if (TREE_THIS_VOLATILE (decl) || DECL_PRESERVE_P (decl)

>>>>>>>        /* Traditionally we do not eliminate static variables when not

>>>>>>>        optimizing and when not doing toplevel reoder.  */

>>>>>>> diff --git a/gcc/common.opt b/gcc/common.opt

>>>>>>> index f0d7196..e497795 100644

>>>>>>> --- a/gcc/common.opt

>>>>>>> +++ b/gcc/common.opt

>>>>>>> @@ -1590,6 +1590,10 @@ fira-algorithm=

>>>>>>>  Common Joined RejectNegative Enum(ira_algorithm) Var(flag_ira_algorithm) Init(IRA_ALGORITHM_CB) Optimization

>>>>>>>  -fira-algorithm=[CB|priority] Set the used IRA algorithm.

>>>>>>>

>>>>>>> +fipa-increase_alignment

>>>>>>> +Common Report Var(flag_ipa_increase_alignment) Init(0) Optimization

>>>>>>> +Option to gate increase_alignment ipa pass.

>>>>>>> +

>>>>>>>  Enum

>>>>>>>  Name(ira_algorithm) Type(enum ira_algorithm) UnknownError(unknown IRA algorithm %qs)

>>>>>>>

>>>>>>> @@ -2133,7 +2137,7 @@ Common Report Var(flag_sched_dep_count_heuristic) Init(1) Optimization

>>>>>>>  Enable the dependent count heuristic in the scheduler.

>>>>>>>

>>>>>>>  fsection-anchors

>>>>>>> -Common Report Var(flag_section_anchors) Optimization

>>>>>>> +Common Report Var(flag_section_anchors)

>>>>>>>  Access data in the same section from shared anchor points.

>>>>>>>

>>>>>>>  fsee

>>>>>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

>>>>>>> index a0db3a4..1482566 100644

>>>>>>> --- a/gcc/config/aarch64/aarch64.c

>>>>>>> +++ b/gcc/config/aarch64/aarch64.c

>>>>>>> @@ -8252,6 +8252,8 @@ aarch64_override_options (void)

>>>>>>>

>>>>>>>    aarch64_register_fma_steering ();

>>>>>>>

>>>>>>> +  /* Enable increase_alignment pass.  */

>>>>>>> +  flag_ipa_increase_alignment = 1;

>>>>>>

>>>>>> I would rather enable it always on targets that do support anchors.

>>>>> AFAIK aarch64 supports section anchors.

>>>>>>> diff --git a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c

>>>>>>> index ce9e146..7f09f3a 100644

>>>>>>> --- a/gcc/lto/lto-symtab.c

>>>>>>> +++ b/gcc/lto/lto-symtab.c

>>>>>>> @@ -342,6 +342,13 @@ lto_symtab_merge (symtab_node *prevailing, symtab_node *entry)

>>>>>>>       The type compatibility checks or the completing of types has properly

>>>>>>>       dealt with most issues.  */

>>>>>>>

>>>>>>> +  /* ??? is this assert necessary ?  */

>>>>>>> +  varpool_node *v_prevailing = dyn_cast<varpool_node *> (prevailing);

>>>>>>> +  varpool_node *v_entry = dyn_cast<varpool_node *> (entry);

>>>>>>> +  gcc_assert (v_prevailing && v_entry);

>>>>>>> +  /* section_anchor of prevailing_decl wins.  */

>>>>>>> +  v_entry->section_anchor = v_prevailing->section_anchor;

>>>>>>> +

>>>>>> Other flags are merged in lto_varpool_replace_node so please move this there.

>>>>> Ah indeed, thanks for the pointers.

>>>>> I wonder though if we need to set

>>>>> prevailing_node->section_anchor = vnode->section_anchor ?

>>>>> IIUC, the function merges flags from vnode into prevailing_node

>>>>> and removes vnode. However we want prevailing_node->section_anchor

>>>>> to always take precedence.

>>>>>>> +/* Return true if alignment should be increased for this vnode.

>>>>>>> +   This is done if every function that references/referring to vnode

>>>>>>> +   has flag_tree_loop_vectorize set.  */

>>>>>>> +

>>>>>>> +static bool

>>>>>>> +increase_alignment_p (varpool_node *vnode)

>>>>>>> +{

>>>>>>> +  ipa_ref *ref;

>>>>>>> +

>>>>>>> +  for (int i = 0; vnode->iterate_reference (i, ref); i++)

>>>>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referred))

>>>>>>> +      {

>>>>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>>>>> +       return false;

>>>>>>> +      }

>>>>>>

>>>>>> If you take address of function that has vectorizer enabled probably doesn't

>>>>>> imply need to increase alignment of that var. So please drop the loop.

>>>>>>

>>>>>> You only want function that read/writes or takes address of the symbol. But

>>>>>> onthe other hand, you need to walk all aliases of the symbol by

>>>>>> call_for_symbol_and_aliases

>>>>>>> +

>>>>>>> +  for (int i = 0; vnode->iterate_referring (i, ref); i++)

>>>>>>> +    if (cgraph_node *cnode = dyn_cast<cgraph_node *> (ref->referring))

>>>>>>> +      {

>>>>>>> +     struct cl_optimization *opts = opts_for_fn (cnode->decl);

>>>>>>> +     if (!opts->x_flag_tree_loop_vectorize)

>>>>>>> +       return false;

>>>>>>> +      }

>>>>>>> +

>>>>>>> +  return true;

>>>>>>> +}

>>>>>>> +

>>>>>>>  /* Entry point to increase_alignment pass.  */

>>>>>>>  static unsigned int

>>>>>>>  increase_alignment (void)

>>>>>>> @@ -914,9 +942,12 @@ increase_alignment (void)

>>>>>>>        tree decl = vnode->decl;

>>>>>>>        unsigned int alignment;

>>>>>>>

>>>>>>> -      if ((decl_in_symtab_p (decl)

>>>>>>> -       && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>>>>> -       || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))

>>>>>>> +      if (!vnode->section_anchor

>>>>>>> +       || (decl_in_symtab_p (decl)

>>>>>>> +           && !symtab_node::get (decl)->can_increase_alignment_p ())

>>>>>>> +       || DECL_USER_ALIGN (decl)

>>>>>>> +       || DECL_ARTIFICIAL (decl)

>>>>>>> +       || !increase_alignment_p (vnode))

>>>>>>

>>>>>> Incrementally we probably should do more testing whether the variable looks like

>>>>>> someting that can be vectorized, i.e. it contains array, has address taken or the

>>>>>> accesses are array accesses within loop.

>>>>>> This can be done by the analysis phase of the IPA pass inspecting the function

>>>>>> bodies.

>>>>> Thanks, I will try to check for array accesses are within a loop in

>>>>> followup patch.

>>>>> I was wondering if we could we treat a homogeneous global struct

>>>>> (having members of one type),

>>>>> as a global array of that type and increase it's alignment if required ?

>>>>>>

>>>>>> I think it is important waste to bump up everything including error messages etc.

>>>>>> At least on i386 the effect on firefox datasegment of various alignment setting is

>>>>>> very visible.

>>>>> Um for a start, would it be OK to check if all functions referencing variable

>>>>> have attribute noreturn, and in that case we skip increasing the alignment ?

>>>>> I suppose that error functions would be having attribute noreturn set ?

>>>>>>

>>>>>> Looks OK to me otherwise. please send updated patch.

>>>>> I have done the changes in the attached patch (stage-1 built).

>>>>> I am not sure what to return from the callback function and

>>>>> arbitrarily chose to return true.

>>>> Hi,

>>>> In this version (stage-1 built), I added read/write summaries which

>>>> stream those variables

>>>> which we want to increase alignment for.

>>>>

>>>> I have a few questions:

>>>>

>>>> a) To check if global var is used inside a loop, I obtained

>>>> corresponding ipa_ref

>>>> and checked loop_containing_stmt (ref->stmt), however it returned non-NULL

>>>> even when ref->stmt was not inside a loop.

>>>> for instance:

>>>> int a[32];

>>>> int f() { int x = a[0]; return x; }

>>>> Um how to check if stmt is used inside a loop ?

>>>>

>>>> b) Is it necessary during WPA to check if function has

>>>> flag_tree_vectorize_set since

>>>> during analysis phase in increase_alignment_generate_summary() I check

>>>> if cnode has flag_tree_loop_vectorize_set ?

>>>>

>>>> c) In LTO_increase_alignment_section, the following is streamed:

>>>> i) a "count" of type uwhi, to represent number of variables

>>>> ii) decls corresponding to the variables

>>>> The variables are then obtained during read_summay using

>>>> symtab_node::get_create().

>>>> I suppose since decls for varpool_nodes would already be streamed in

>>>> LTO_section_decls, I was wondering if I

>>>> could somehow refer to the decls in that section to avoid duplicate

>>>> streaming of decls ?

>>> Hi,

>>> In this version, the variable is streamed if it occurs within a loop

>>> or it's address is taken.

>>> To check if stmt is inside a loop I am using:

>>>

>>> struct loop *l = loop_containing_stmt (ref->stmt);

>>> if (l != DECL_STRUCT_FUNCTION (cnode->decl)->x_current_loops->tree_root)

>>>   vars->add (vnode);

>>> Is this correct ?

>>>

>>> I came across an unexpected issue in my previous patch with -ffat-lto-objects.

>>> I am allocating vars = new hash_set<varpool_node *> () in

>>> generate_summary() and freeing it in write_summary().

>>> However with -ffat-lto-objects, it appears execute() gets called after

>>> write_summary()

>>> during LGEN and we hit segfault in execute() at:

>>> for (hash_set<varpool_node *>::iterator it = vars.begin (); it !=

>>> vars.end (); it++)

>>>   { ... }

>>> because write_summary() has freed vars.

>>> To workaround the issue, I gated call to free vars in write_summary on

>>> flag_fat_lto_objects,

>>> I am not sure if that's a good solution.

>>>

>>> Cross tested on arm*-*-*, aarch64*-*-*.

>>>

>>> Thanks,

>>> Prathamesh

>>>>

>>>> Thanks,

>>>> Prathamesh

>>>>>

>>>>> Thanks,

>>>>> Prathamesh

>>>>>>

>>>>>> Honza
diff mbox

Patch

diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index ecafe63..41ac408 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -1874,6 +1874,9 @@  public:
      if we did not do any inter-procedural code movement.  */
   unsigned used_by_single_function : 1;
 
+  /* Set if -fsection-anchors is set.  */
+  unsigned section_anchor : 1;
+
 private:
   /* Assemble thunks and aliases associated to varpool node.  */
   void assemble_aliases (void);
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 4bfcad7..e75d5c0 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -800,6 +800,9 @@  varpool_node::finalize_decl (tree decl)
      it is available to notice_global_symbol.  */
   node->definition = true;
   notice_global_symbol (decl);
+
+  node->section_anchor = flag_section_anchors;
+
   if (TREE_THIS_VOLATILE (decl) || DECL_PRESERVE_P (decl)
       /* Traditionally we do not eliminate static variables when not
 	 optimizing and when not doing toplevel reoder.  */
diff --git a/gcc/common.opt b/gcc/common.opt
index f0d7196..e497795 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1590,6 +1590,10 @@  fira-algorithm=
 Common Joined RejectNegative Enum(ira_algorithm) Var(flag_ira_algorithm) Init(IRA_ALGORITHM_CB) Optimization
 -fira-algorithm=[CB|priority] Set the used IRA algorithm.
 
+fipa-increase_alignment
+Common Report Var(flag_ipa_increase_alignment) Init(0) Optimization
+Option to gate increase_alignment ipa pass.
+
 Enum
 Name(ira_algorithm) Type(enum ira_algorithm) UnknownError(unknown IRA algorithm %qs)
 
@@ -2133,7 +2137,7 @@  Common Report Var(flag_sched_dep_count_heuristic) Init(1) Optimization
 Enable the dependent count heuristic in the scheduler.
 
 fsection-anchors
-Common Report Var(flag_section_anchors) Optimization
+Common Report Var(flag_section_anchors)
 Access data in the same section from shared anchor points.
 
 fsee
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a0db3a4..1482566 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8252,6 +8252,8 @@  aarch64_override_options (void)
 
   aarch64_register_fma_steering ();
 
+  /* Enable increase_alignment pass.  */
+  flag_ipa_increase_alignment = 1;
 }
 
 /* Implement targetm.override_options_after_change.  */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 3503c15..b7f448e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -3458,6 +3458,9 @@  arm_option_override (void)
 
   /* Init initial mode for testing.  */
   thumb_flipper = TARGET_THUMB;
+
+  /* Enable increase_alignment pass.  */
+  flag_ipa_increase_alignment = 1;
 }
 
 static void
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 2d7df6b..ed59068 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5011,6 +5011,9 @@  rs6000_option_override (void)
     = { pass_analyze_swaps, "cse1", 1, PASS_POS_INSERT_BEFORE };
 
   register_pass (&analyze_swaps_info);
+
+  /* Enable increase_alignment pass.  */
+  flag_ipa_increase_alignment = 1;
 }
 
 
diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 5cef2ba..289d9c3 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -627,6 +627,7 @@  lto_output_varpool_node (struct lto_simple_output_block *ob, varpool_node *node,
   bp_pack_value (&bp, node->tls_model, 3);
   bp_pack_value (&bp, node->used_by_single_function, 1);
   bp_pack_value (&bp, node->need_bounds_init, 1);
+  bp_pack_value (&bp, node->section_anchor, 1);
   streamer_write_bitpack (&bp);
 
   group = node->get_comdat_group ();
@@ -1401,6 +1402,7 @@  input_varpool_node (struct lto_file_decl_data *file_data,
   node->tls_model = (enum tls_model)bp_unpack_value (&bp, 3);
   node->used_by_single_function = (enum tls_model)bp_unpack_value (&bp, 1);
   node->need_bounds_init = bp_unpack_value (&bp, 1);
+  node->section_anchor = bp_unpack_value (&bp, 1);
   group = read_identifier (ib);
   if (group)
     {
diff --git a/gcc/lto-section-in.c b/gcc/lto-section-in.c
index d8e74d7..d56f9d4 100644
--- a/gcc/lto-section-in.c
+++ b/gcc/lto-section-in.c
@@ -52,7 +52,8 @@  const char *lto_section_name[LTO_N_SECTION_TYPES] =
   "icf",
   "offload_table",
   "mode_table",
-  "hsa"
+  "hsa",
+  "increase_alignment"
 };
 
 
diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
index 92efdb8..725d0dc 100644
--- a/gcc/lto-streamer.h
+++ b/gcc/lto-streamer.h
@@ -245,6 +245,7 @@  enum lto_section_type
   LTO_section_offload_table,
   LTO_section_mode_table,
   LTO_section_ipa_hsa,
+  LTO_section_increase_alignment,
   LTO_N_SECTION_TYPES		/* Must be last.  */
 };
 
diff --git a/gcc/passes.def b/gcc/passes.def
index 3647e90..3a8063c 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -138,12 +138,12 @@  along with GCC; see the file COPYING3.  If not see
   PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile)
       NEXT_PASS (pass_feedback_split_functions);
   POP_INSERT_PASSES ()
-  NEXT_PASS (pass_ipa_increase_alignment);
   NEXT_PASS (pass_ipa_tm);
   NEXT_PASS (pass_ipa_lower_emutls);
   TERMINATE_PASS_LIST (all_small_ipa_passes)
 
   INSERT_PASSES_AFTER (all_regular_ipa_passes)
+  NEXT_PASS (pass_ipa_increase_alignment);
   NEXT_PASS (pass_ipa_whole_program_visibility);
   NEXT_PASS (pass_ipa_profile);
   NEXT_PASS (pass_ipa_icf);
diff --git a/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-73.c b/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-73.c
new file mode 100644
index 0000000..74eaed8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-73.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target section_anchors } */
+/* { dg-require-effective-target vect_int } */
+
+#define N 32
+
+/* Clone of section-anchors-vect-70.c with foo() having -fno-tree-loop-vectorize.  */ 
+
+static struct A {
+  int p1, p2;
+  int e[N];
+} a, b, c;
+
+__attribute__((optimize("-fno-tree-loop-vectorize")))
+int foo(void)
+{
+  for (int i = 0; i < N; i++)
+    a.e[i] = b.e[i] + c.e[i];
+
+   return a.e[0];
+}
+
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target arm*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-74.c b/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-74.c
new file mode 100644
index 0000000..574d917
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-74.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target section_anchors } */
+/* { dg-require-effective-target vect_int } */
+
+#define N 32
+
+/* Do not increase alignment of a since it does not have its
+   address taken and doesn't occur inside a loop.  */
+
+static int a[N];
+
+int foo(void)
+{
+  return a[0];
+}
+
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target arm*-*-* } } } */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 36299a6..d36aa1d 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -483,7 +483,7 @@  extern simple_ipa_opt_pass *make_pass_local_optimization_passes (gcc::context *c
 
 extern ipa_opt_pass_d *make_pass_ipa_whole_program_visibility (gcc::context
 							       *ctxt);
-extern simple_ipa_opt_pass *make_pass_ipa_increase_alignment (gcc::context
+extern ipa_opt_pass_d *make_pass_ipa_increase_alignment (gcc::context
 							      *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_inline (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_ipa_free_lang_data (gcc::context *ctxt);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 2669813..f240747 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -75,7 +75,10 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-propagate.h"
 #include "dbgcnt.h"
 #include "tree-scalar-evolution.h"
-
+#include "tree-ssa-loop.h"
+#include "tree-streamer.h"
+#include "stringpool.h"
+#include "gimple-pretty-print.h"
 
 /* Loop or bb location.  */
 source_location vect_location;
@@ -795,7 +798,8 @@  make_pass_slp_vectorize (gcc::context *ctxt)
      array padding.  */
 
 static unsigned get_vec_alignment_for_type (tree);
-static hash_map<tree, unsigned> *type_align_map;
+static hash_map<tree, unsigned> *type_align_map = NULL;
+static hash_set<varpool_node *> *vars = NULL;
 
 /* Return alignment of array's vector type corresponding to scalar type.
    0 if no vector type exists.  */
@@ -899,46 +903,199 @@  get_vec_alignment_for_type (tree type)
   return (alignment > TYPE_ALIGN (type)) ? alignment : 0;
 }
 
+/* Return true if alignment should be increased for this vnode.
+   This is done if every function referring to vnode
+   has flag_tree_loop_vectorize set.  */ 
+
+static bool
+increase_alignment_p (varpool_node *vnode)
+{
+  ipa_ref *ref;
+  cgraph_node *cnode;
+
+  /* FIXME: Is this really necessary, since we check if cnode has
+     flag_tree_loop_vectorize_set during analysis phase in
+     increase_alignment_analyze_function ?  */
+
+  for (int i = 0; vnode->iterate_referring (i, ref); i++)
+    /* Walk those functions that read/write/take address of vnode.  */
+    if ((cnode = dyn_cast<cgraph_node *> (ref->referring)))
+      if (!opt_for_fn (cnode->decl, flag_tree_loop_vectorize))
+	return false;
+	
+  return true; 
+}
+
+static bool 
+increase_alignment_callback (varpool_node *vnode, void *data ATTRIBUTE_UNUSED)
+{
+  tree decl = vnode->decl;
+
+  if (vnode->section_anchor
+      && decl_in_symtab_p (decl)
+      && symtab_node::get (decl)->can_increase_alignment_p ()
+      && !DECL_USER_ALIGN (decl)
+      && increase_alignment_p (vnode))
+    {
+      unsigned alignment = get_vec_alignment_for_type (TREE_TYPE (decl));
+      if (alignment && vect_can_force_dr_alignment_p (decl, alignment))
+	{
+	  vnode->increase_alignment (alignment);
+	  dump_printf (MSG_NOTE, "Increasing alignment of decl: ");
+	  dump_generic_expr (MSG_NOTE, TDF_SLIM, decl);
+	  dump_printf (MSG_NOTE, "\n");
+	}
+    }
+
+  return true;
+}
+
 /* Entry point to increase_alignment pass.  */
 static unsigned int
 increase_alignment (void)
 {
-  varpool_node *vnode;
-
   vect_location = UNKNOWN_LOCATION;
   type_align_map = new hash_map<tree, unsigned>;
 
   /* Increase the alignment of all global arrays for vectorization.  */
-  FOR_EACH_DEFINED_VARIABLE (vnode)
+  for (hash_set<varpool_node *>::iterator it = vars->begin (); it != vars->end (); ++it)
     {
-      tree decl = vnode->decl;
-      unsigned int alignment;
-
-      if ((decl_in_symtab_p (decl)
-	  && !symtab_node::get (decl)->can_increase_alignment_p ())
-	  || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))
-	continue;
-
-      alignment = get_vec_alignment_for_type (TREE_TYPE (decl));
-      if (alignment && vect_can_force_dr_alignment_p (decl, alignment))
-        {
-	  vnode->increase_alignment (alignment);
-          dump_printf (MSG_NOTE, "Increasing alignment of decl: ");
-          dump_generic_expr (MSG_NOTE, TDF_SLIM, decl);
-          dump_printf (MSG_NOTE, "\n");
-        }
+      varpool_node *vnode = *it;
+      vnode->call_for_symbol_and_aliases (increase_alignment_callback, NULL, true);
     }
 
   delete type_align_map;
+  delete vars;
   return 0;
 }
 
+static void
+increase_alignment_analyze_function (cgraph_node *cnode)
+{
+  ipa_ref *ref;
+
+  for (int i = 0; cnode->iterate_reference (i, ref); i++)
+    if (varpool_node *vnode = dyn_cast<varpool_node *> (ref->referred))
+      {
+	if (vars->contains (vnode))
+	  return; 
+
+	tree decl = vnode->decl;
+	if (! (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE
+	       || TREE_CODE (TREE_TYPE (decl)) == RECORD_TYPE)
+	       && !DECL_ARTIFICIAL (decl))
+	  continue;
+
+	if (ref->use == IPA_REF_ADDR)
+	  vars->add (vnode);
+	else
+	  {
+	    struct loop *l = loop_containing_stmt (ref->stmt);
+	    if (l != DECL_STRUCT_FUNCTION (cnode->decl)->x_current_loops->tree_root)
+	      vars->add (vnode);
+	  }
+      }
+}
+
+static void
+increase_alignment_generate_summary (void)
+{
+  vars = new hash_set<varpool_node *> ();
+  cgraph_node *cnode;
+
+  FOR_EACH_DEFINED_FUNCTION (cnode)
+    if (opt_for_fn (cnode->decl, flag_tree_loop_optimize))
+      increase_alignment_analyze_function (cnode);
+}
+
+static void
+increase_alignment_write_summary (void)
+{
+  /* Following is streamed to the section:
+   a) count of type uhwi, representing number of decls.
+   b) the decls.  */
+
+  struct output_block *ob = create_output_block (LTO_section_increase_alignment);
+  streamer_write_uhwi (ob, vars->elements ()); 
+
+  for (hash_set<varpool_node *>::iterator it = vars->begin (); it != vars->end (); ++it)
+    {
+      varpool_node *vnode = *it;
+      stream_write_tree (ob, vnode->decl, true);
+    }
+
+  produce_asm (ob, NULL);
+  destroy_output_block (ob);
+
+  /* FIXME: This is a workaround for segfault when flag_fat_lto_objects gets
+     enabled either directly or by -fno-use-linker-plugin. 
+     With flag_fat_lto_objects, execute() is called also during analysis
+     stage, after write_summary(). So if we delete vars here, we segfault
+     during execute() since it requires vars. I am not sure what's a good
+     solution to work around the issue.  */
+
+  if (!global_options.x_flag_fat_lto_objects)
+    delete vars;
+}
+
+static void
+increase_alignment_read_section (struct lto_file_decl_data *file_data,
+				 const char *data, size_t len)
+{
+  const struct lto_function_header *header =
+    (const struct lto_function_header *) data;
+
+  const int cfg_offset = sizeof (struct lto_function_header);
+  const int main_offset = cfg_offset + header->cfg_size;
+  const int string_offset = main_offset + header->main_size;
+
+  lto_input_block ib (data + main_offset, header->main_size,
+		      file_data->mode_table);
+
+  struct data_in *data_in =
+    lto_data_in_create (file_data, data + string_offset,
+			header->string_size, vNULL); 
+
+  unsigned HOST_WIDE_INT count = streamer_read_uhwi (&ib);
+
+  for (unsigned HOST_WIDE_INT i = 0; i < count; i++)
+    {
+      tree decl = stream_read_tree (&ib, data_in);
+      varpool_node *vnode = varpool_node::get_create (decl);
+      if (vnode)
+	vars->add (vnode);
+    } 
+
+  lto_free_section_data (file_data, LTO_section_increase_alignment, NULL,
+			 data, len);
+
+  lto_data_in_delete (data_in);
+}
+
+static void
+increase_alignment_read_summary (void)
+{
+  struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data ();
+  struct lto_file_decl_data *file_data;
+  unsigned i = 0;
+
+  vars = new hash_set<varpool_node *> ();
+
+  while ((file_data = file_data_vec[i++]))
+    {
+      size_t len;
+      const char *data = lto_get_section_data (file_data, LTO_section_increase_alignment, NULL, &len);
+
+      if (data)
+	increase_alignment_read_section (file_data, data, len);
+    }
+}
 
 namespace {
 
 const pass_data pass_data_ipa_increase_alignment =
 {
-  SIMPLE_IPA_PASS, /* type */
+  IPA_PASS, /* type */
   "increase_alignment", /* name */
   OPTGROUP_LOOP | OPTGROUP_VEC, /* optinfo_flags */
   TV_IPA_OPT, /* tv_id */
@@ -949,17 +1106,26 @@  const pass_data pass_data_ipa_increase_alignment =
   0, /* todo_flags_finish */
 };
 
-class pass_ipa_increase_alignment : public simple_ipa_opt_pass
+class pass_ipa_increase_alignment : public ipa_opt_pass_d
 {
 public:
   pass_ipa_increase_alignment (gcc::context *ctxt)
-    : simple_ipa_opt_pass (pass_data_ipa_increase_alignment, ctxt)
+    : ipa_opt_pass_d (pass_data_ipa_increase_alignment, ctxt,
+		      increase_alignment_generate_summary, 
+		      increase_alignment_write_summary,	
+		      increase_alignment_read_summary,	
+			   NULL, /* write optimization summary  */
+			   NULL, /* read optimization summary  */
+			   NULL, /* stmt fixup  */
+			   0, /* function_transform_todo_flags_start  */
+			   NULL, /* transform function  */
+			   NULL )/* variable transform  */
   {}
 
   /* opt_pass methods: */
   virtual bool gate (function *)
     {
-      return flag_section_anchors && flag_tree_loop_vectorize;
+      return flag_ipa_increase_alignment != 0; 
     }
 
   virtual unsigned int execute (function *) { return increase_alignment (); }
@@ -968,7 +1133,7 @@  public:
 
 } // anon namespace
 
-simple_ipa_opt_pass *
+ipa_opt_pass_d *
 make_pass_ipa_increase_alignment (gcc::context *ctxt)
 {
   return new pass_ipa_increase_alignment (ctxt);