@InProceedings{Alahmadi2018,
  author         = {Alahmadi, Mohammad and Hassel, Jonathan and Parajuli, Biswas and Haiduc, Sonia and Kumar, Piyush},
  title          = {Accurately Predicting the Location of Code Fragments in Programming Video Tutorials Using Deep Learning},
  booktitle      = {Proceedings of the 14th {International} {Conference} on {Predictive} {Models} and {Data} {Analytics} in {Software} {Engineering} ({PROMISE}'18), {Technical} {Research} {Track}},
  year           = {2018},
  note           = {To appear (12 pages)},
  address        = {Oulu, Finland},
  month          = oct,
  publisher      = {Springer},
  acceptancerate = {21.2},
  timestamp      = {2018-07-10},
}
@InProceedings{Mills2018a,
  author         = {Mills, Chris and Escobar-Avila, Javier and Haiduc, Sonia},
  title          = {{Automatic} {Traceability} {Maintenance} via {Machine} {Learning} {Classification}},
  booktitle      = {Proceedings of the 34th {IEEE} {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'18), {Technical} {Research} {Track}},
  year           = {2018},
  note           = {To appear (12 pages)},
  address        = {Madrid, Spain},
  month          = sep,
  publisher      = {ACM},
  acceptancerate = {21.2},
  pdf            = {http://www.cs.fsu.edu/~serene/mills_icsme_18_trace/},
  timestamp      = {2018-07-10},
}
@InProceedings{Mills2018b,
  author         = {Mills, Chris and Pantiuchina, Jevgenija and Parra, Esteban and Bavota, Gabriele and Haiduc, Sonia},
  title          = {{Are} {Bug} {Reports} {Enough} for {Text} {Retrieval-based} {Bug} {Localization?}},
  booktitle      = {Proceedings of the 34th {IEEE} {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'18), {Technical} {Research} {Track}},
  year           = {2018},
  note           = {To appear (12 pages)},
  address        = {Madrid, Spain},
  month          = sep,
  publisher      = {ACM},
  acceptancerate = {21.2},
  pdf            = {http://www.cs.fsu.edu/~serene/mills_icsme_18_bugs/},
  timestamp      = {2018-07-10},
}
@InProceedings{Parra2018,
  author         = {Parra, Esteban and Escobar-Avila, Javier and Haiduc, Sonia},
  title          = {Automatic {Tagging} for {Software} {Engineering} {Videos}},
  booktitle      = {Proceedings of the 26th {ACM}/{IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC}'18), {Technical} {Research} {Track}},
  year           = {2018},
  pages          = {222--232},
  address        = {Gothenburg, Sweden},
  month          = may,
  publisher      = {ACM},
  acceptancerate = {37.6},
  doi            = {10.1145/3196321.3196351},
  keywords       = {Software engineering; video tutorials; tagging; text retrieval},
  pdf            = {http://www.cs.fsu.edu/~serene/parra-icpc2018/},
  timestamp      = {2018-08-12},
}
@Article{Ponzanelli2018c,
  author  = {Ponzanelli, L. and Bavota, G. and Mocci, A. and Oliveto, R. and Di Penta, M. and Haiduc, Sonia and Russo, B. and Lanza, M.},
  title   = {{Automatic} {Identification} and {Classification} of {Software} {Development} {Video} {Tutorial} {Fragments}},
  journal = {{IEEE} {Transactions} on {Software} {Engineering} ({TSE})},
  year    = {2018},
  note    = {Accepted for publication},
}
@InProceedings{Escobar-Avila2017,
  author         = {Escobar-Avila, Javier and Parra, Esteban and Haiduc, Sonia},
  title          = {Text {Retrieval-based} {Tagging} of {Software} {Engineering} {Video} {Tutorials}},
  booktitle      = {Proceedings of the 39th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'17), {Poster} {Track}},
  year           = {2017},
  pages          = {341--343},
  address        = {Buenos Aires, Argentina},
  month          = may,
  publisher      = {ACM},
  abstract       = {Video tutorials are an emerging form of documentation in software engineering and can efficiently provide developers with useful information needed for their daily tasks. However, to get the information they need, developers have to find the right tutorial for their task at hand. Currently, there is little information available to quickly judge whether a tutorial is relevant to a topic or helpful for task at hand, which can lead to missing the best tutorials and wasting time watching irrelevant ones. We present the first efforts towards new tagging approaches using text retrieval that describe the contents of software engineering video tutorials, making it easier and faster to understand their purpose and contents. We also present the results of a preliminary evaluation of thirteen such approaches, revealing the potential of some and limitations of others. Our future work will focus on improving on the promising approaches determined in this preliminary study and supplementing them with additional information.},
  acceptancerate = {57.8},
  doi            = {10.1109/ICSE-C.2017.121},
  keywords       = {Software engineering; video tutorials; tagging; text retrieval},
  timestamp      = {2017.02.12},
}
@InProceedings{Mills2017a,
  author    = {Mills, Chris and Haiduc, Sonia},
  title     = {A {Machine} {Learning} {Approach} for {Determining} the {Validity} of {Traceability} {Links}},
  booktitle = {Proceedings of the 39th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'17), {Poster} {Track}},
  year      = {2017},
  pages     = {121--123},
  address   = {Buenos Aires, Argentina},
  month     = may,
  publisher = {ACM},
  doi       = {10.1109/ICSE-C.2017.86},
  timestamp = {2017.02.12},
}
@InProceedings{Mills2017b,
  author         = {Mills, Chris and Haiduc, Sonia},
  title          = {The {Impact} of {Retrieval} {Direction} on {IR}-based {Traceability} {Link} {Recovery}},
  booktitle      = {Proceedings of the 39th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'17), {NIER} {Track}},
  year           = {2017},
  pages          = {51--54},
  address        = {Buenos Aires, Argentina},
  month          = may,
  publisher      = {ACM},
  acceptancerate = {16},
  doi            = {10.1109/ICSE-NIER.2017.14},
  timestamp      = {2017.02.12},
}
@Article{Mills2017e,
  author    = {Mills, Chris and Bavota, Gabriele and Haiduc, Sonia and Oliveto, Rocco and Marcus, Andrian and De Lucia, Andrea},
  title     = {Predicting {Query} {Quality} for {Applications} of {Text} {Retrieval} to {Software} {Engineering} {Tasks}},
  journal   = {{ACM} {Transactions} on {Software} {Engineering} and {Methodology} ({TOSEM})},
  year      = {2017},
  volume    = {26},
  number    = {1},
  pages     = {1--45},
  month     = jul,
  doi       = {10.1145/3078841},
  timestamp = {Mon, 31 Jul 2017 13:11:39 +0200},
}
@InProceedings{Haiduc2016,
  author         = {Haiduc, Sonia and Arnaoudova, Venera and Marcus, Andrian and Antoniol, G.},
  title          = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
  booktitle      = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16), {Technical} {Briefings}},
  year           = {2016},
  pages          = {898--899},
  address        = {Austin, TX, USA},
  publisher      = {ACM},
  abstract       = {This technical briefing presents the state of the art Text Retrieval and Natural Language Processing techniques used in Software Engineering and discusses their applications in the field.},
  acceptancerate = {41.9},
  doi            = {10.1145/2889160.2891053},
  timestamp      = {2016.05.05},
}
@InProceedings{Imminni2016,
  author         = {Imminni, S. and Hasan, M. and Duckett, M. and Sachdeva, P. and Karmakar, S. and Kumar, P. and Haiduc, Sonia},
  title          = {{SPYSE}: A {Semantic} {Search} {Engine} for {Python} {Packages} and {Modules}},
  booktitle      = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16), {Tool} {Demo} {Track}},
  year           = {2016},
  pages          = {625--628},
  address        = {Austin, TX, USA},
  publisher      = {ACM},
  abstract       = {Code reuse is a common practice among software developers, whether novices or experts. Developers often rely on online resources in order to find code to reuse. For Python, the Python Package Index (PyPI) contains all packages developed for the community and is the largest catalog of reusable, open source packages developers can consult. While a valuable resource, the state of the art PyPI search has very limited capabilities, making it hard for developers to find useful, high quality Python code to use for their task at hand. We introduce SPYSE (Semantic PYthon Search Engine), a web-based search engine that overcomes the limitations of the state of the art, making it easier for developers to find useful code. The power of SPYSE lays in the combination of three different aspects meant to provide developers with relevant, and at the same time high quality code: code semantics, popularity, and code quality. SPYSE also allows searching for modules, in addition to packages, which opens new reuse opportunities for developers, currently not supported. TOOL URL: https://pypi.compgeom.com VIDEO URL: https://youtu.be/Praglw-vS50},
  acceptancerate = {31},
  doi            = {10.1145/2889160.2889174},
  timestamp      = {2016.05.05},
}
@InProceedings{Parra2016,
  author         = {Parra, Esteban and Haiduc, Sonia and James, Rebecca},
  title          = {Making a {Difference}: {An} {Overview} of {Humanitarian} {Free} {Open} {Source} {Systems}},
  booktitle      = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16), {Poster} {Track}},
  year           = {2016},
  pages          = {731--733},
  address        = {Austin, TX, USA},
  publisher      = {ACM},
  abstract       = {Humanitarian Free Open Source Software (HFOSS) serves philanthropic goals that usually benefit non-profit organizations meant to improve the human condition. The altruistic goals these systems serve can offer developers additional motivation for contributing to OSS and have been seen as a way to attract more women to computing majors and to improve students' learning. We present an exploratory study of the currently existing HFOSS projects, aimed at giving an overview of their properties, including the most common application domains and the most popular programming languages used in this kind of systems. We also investigated the assumption that HFOSS systems attract more women developers and found the assumption to be incorrect.},
  acceptancerate = {57.8},
  doi            = {10.1145/2889160.2892651},
  timestamp      = {2016.05.05},
}
@InProceedings{Ponzanelli2016b,
  author         = {Ponzanelli, L. and Bavota, G. and Mocci, A. and Di Penta, M. and Oliveto, R. and Hasan, M. and Russo, B. and Haiduc, Sonia and Lanza, M.},
  title          = {Too {Long}; {Didn't} {Watch}! {Extracting} {Relevant} {Fragments} from {Software} {Development} {Video} {Tutorials}},
  booktitle      = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16), {Technical} {Research} {Track}},
  year           = {2016},
  pages          = {261--272},
  address        = {Austin, TX, USA},
  publisher      = {ACM},
  abstract       = {When knowledgeable colleagues are not available, developers resort to offline and online resources, e.g., tutorials, mailing lists, and Q\&A websites. These, however, need to be found, read, and understood, which takes its toll in terms of time and mental energy. A more immediate and accessible resource are video tutorials found on the web, which in recent years have seen a steep increase in popularity. Nonetheless, videos are an intrinsically noisy data source, and finding the right piece of information might be even more cumbersome than using the previously mentioned resources. We present CodeTube, an approach which mines video tutorials found on the web, and enables developers to query their contents. The video tutorials are split into coherent fragments, to return only fragments related to the query. These are complemented with information from additional sources, such as Stack Overflow discussions. The results of two studies to assess CodeTube indicate that video tutorials---if appropriately processed---represent a useful, yet still under-utilized source of information for software development.},
  acceptancerate = {19},
  doi            = {10.1145/2884781.2884824},
  timestamp      = {2016.05.05},
}
@InProceedings{Ponzanelli2016,
  author         = {Ponzanelli, L. and Bavota, G. and Mocci, A. and Di Penta, M. and Oliveto, R. and Russo, B. and Haiduc, Sonia and Lanza, M.},
  title          = {{CodeTube}: {Extracting} {Relevant} {Fragments} from {Software} {Development} {Video} {Tutorials}},
  booktitle      = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16), {Tool} {Demo} {Track}},
  year           = {2016},
  pages          = {645--648},
  address        = {Austin, TX, USA},
  publisher      = {ACM},
  abstract       = {Nowadays developers heavily rely on sources of informal documentation, including Q\&A forums, slides, or video tutorials, the latter being particularly useful to provide introductory notions for a piece of technology. The current practice is that developers have to browse sources individually, which in the case of video tutorials is cumbersome, as they are lengthy and cannot be searched based on their contents. We present CodeTube, a Web-based recommender system that analyzes the contents of video tutorials and is able to provide, given a query, cohesive and self-contained video fragments, along with links to relevant Stack Overflow discussions. CodeTube relies on a combination of textual analysis and image processing applied on video tutorial frames and speech transcripts to split videos into cohesive fragments, index them and identify related Stack Overflow discussions. DEMO URL: http://codetube.inf.usi.ch VIDEO URL: https://youtu.be/yUsUG3g87Dg},
  acceptancerate = {31},
  doi            = {10.1145/2889160.2889172},
  timestamp      = {2016.05.05},
}
@InProceedings{Arnaoudova2015,
  author    = {Arnaoudova, V. and Haiduc, S. and Marcus, A. and Antoniol, G.},
  title     = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
  booktitle = {{ACM} {Conference} on {Systems}, {Programming}, {Languages} and {Applications}: {Software} for {Humanity} ({SPLASH}'15), {Tutorials}},
  year      = {2015},
  address   = {Pittsburgh, PA, USA},
  month     = oct,
  keywords  = {information retrieval, Software engineering},
  timestamp = {2015.09.18},
}
@InProceedings{Arnaoudova2015a,
  author    = {Arnaoudova, V. and Haiduc, S. and Marcus, A. and Antoniol, G.},
  title     = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
  booktitle = {Proceedings of the 10th {Joint} {Meeting} of the {European} {Software} {Engineering} {Conference} and the {ACM/SIGSOFT} {Symposium} on the {Foundations} of {Software} {Engineering} ({ESEC/FSE}'15), {Tutorials}},
  year      = {2015},
  address   = {Bergamo, Italy},
  month     = oct,
  keywords  = {information retrieval, Software engineering},
  timestamp = {2015.09.18},
}
@InProceedings{Arnaoudova2015b,
  author         = {Arnaoudova, V. and Haiduc, S. and Marcus, A. and Antoniol, G.},
  title          = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
  booktitle      = {Proceedings of the 37th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'15), {Technical} {Briefings}},
  year           = {2015},
  pages          = {949--950},
  address        = {Florence, Italy},
  month          = may,
  acceptancerate = {40},
  doi            = {10.1109/ICSE.2015.301},
  keywords       = {information retrieval, Software engineering},
  timestamp      = {2015.09.18},
}
@InProceedings{Escobar-Avila2015b,
  author         = {Escobar-Avila, Javier and Linares-Vasquez, Mario and Haiduc, Sonia},
  title          = {Unsupervised {Software} {Categorization} {Using} {Bytecode}},
  booktitle      = {Proceedings of the 23rd {IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC}), {Technical} {Research} {Track}},
  year           = {2015},
  pages          = {229--239},
  address        = {Florence, Italy},
  month          = may,
  acceptancerate = {31.5},
  doi            = {10.1109/ICPC.2015.33},
  keywords       = {Accuracy, bytecode, clustering, Clustering algorithms, Data mining, dirichlet process, Java, Software, software categorization, Software libraries, software profiles},
  timestamp      = {2016.08.31},
}
@InProceedings{MUD2015,
  author    = {Haiduc, S. and Bavota, G.},
  title     = {5th {Workshop on Mining Unstructured Data}},
  booktitle = {Proceedings of the 31st {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'15)},
  year      = {2015},
  pages     = {1--6},
  address   = {Bremen, Germany},
}
@InProceedings{Moreno2015,
  author         = {Moreno, Laura and Bavota, Gabriele and Haiduc, Sonia and Di Penta, Massimiliano and Oliveto, Rocco and Russo, Barbara and Marcus, Andrian},
  title          = {Query-based {Configuration} of {Text} {Retrieval} {Solutions} for {Software} {Engineering} {Tasks}},
  booktitle      = {Proceedings of the 10th {Joint} {Meeting} of the {European} {Software} {Engineering} {Conference} and the {ACM/SIGSOFT} {Symposium} on the {Foundations} of {Software} {Engineering} ({ESEC/FSE}'15), {Technical} {Research} {Track}},
  year           = {2015},
  series         = {{ESEC}/{FSE} 2015},
  pages          = {567--578},
  address        = {Bergamo, Italy},
  publisher      = {ACM},
  abstract       = {Text Retrieval (TR) approaches have been used to leverage the textual information contained in software artifacts to address a multitude of software engineering (SE) tasks. However, TR approaches need to be configured properly in order to lead to good results. Current approaches for automatic TR configuration in SE configure a single TR approach and then use it for all possible queries. In this paper, we show that such a configuration strategy leads to suboptimal results, and propose QUEST, the first approach bringing TR configuration selection to the query level. QUEST recommends the best TR configuration for a given query, based on a supervised learning approach that determines the TR configuration that performs the best for each query according to its properties. We evaluated QUEST in the context of feature and bug localization, using a data set with more than 1,000 queries. We found that QUEST is able to recommend one of the top three TR configurations for a query with a 69\% accuracy, on average. We compared the results obtained with the configurations recommended by QUEST for every query with those obtained using a single TR configuration for all queries in a system and in the entire data set. We found that using QUEST we obtain better results than with any of the considered TR configurations.},
  acceptancerate = {25.4},
  doi            = {10.1145/2786805.2786859},
  isbn           = {978-1-4503-3675-8},
  keywords       = {configuration, Feature and Bug Localization, Text-Retrieval in Software Engineering},
  timestamp      = {2016.08.31},
}
@InProceedings{MUD2014,
  author    = {Bacchelli, A. and Haiduc, S.},
  title     = {4th {Workshop on Mining Unstructured Data}},
  booktitle = {Proceedings of the 30th {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'14)},
  year      = {2014},
  address   = {Victoria, BC, Canada},
}
@InProceedings{Haiduc2014,
  author    = {Haiduc, S.},
  title     = {Supporting {Query} {Formulation} for {Text} {Retrieval} {Applications} in {Software} {Engineering}},
  booktitle = {Proceedings of the 30th {IEEE} {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'14), {Doctoral} {Symposium}},
  year      = {2014},
  pages     = {657--662},
  address   = {Victoria, Canada},
  month     = sep,
  abstract  = {Text Retrieval (TR) techniques have been successfully used to leverage the textual information found in software artifacts with the purpose of aiding developers with their daily tasks. TR techniques require a query as input and the usefulness of the results they retrieve depends greatly on this query. While some queries retrieve relevant information for the current task, others do not, therefore pointing developers in the wrong direction. Developers have a hard time realizing this before going through the search results, which, in the case of ``bad'' queries means time and effort lost looking at irrelevant information. In this scenario, developers have to reformulate the query, often without pointers on how to improve it. The work presented in this paper introduces novel approaches to address these challenges and makes two main contributions: 1) defines the first approach for predicting the success of a TR query in the context of SE tasks, 2) introduces automatic approaches that analyze a query and improve it by finding the most suited reformulation for it. The approaches were evaluated for the task of concept location in source code and the results of the performed studies reveal their usefulness.},
  doi       = {10.1109/ICSME.2014.117},
  keywords  = {Conferences, query, query formulation, Query Quality, Query Reformulation, relevant information retrieval, software artifacts, Software engineering, software maintenance, source code, text analysis, text retrieval, text retrieval applications, textual information, TR query prediction, TR techniques},
  timestamp = {2016.08.31},
}
@InProceedings{wcre2013,
  author    = {Bacchelli, A. and Bettenburg, N. and Guerrouj, L. and Haiduc, S.},
  title     = {3rd {Workshop} on {Mining} {Unstructured} {Data}},
  booktitle = {20th {Working} {Conference} on {Reverse} {Engineering} ({WCRE}'13)},
  year      = {2013},
  pages     = {491--492},
  address   = {Koblenz, Germany},
  month     = oct,
}
@InProceedings{Haiduc2013,
  author         = {Haiduc, S. and Bavota, G. and Marcus, A. and Oliveto, R. and De Lucia, A. and Menzies, T.},
  title          = {{Automatic} {Query} {Reformulations} for {Text} {Retrieval} in {Software} {Engineering}},
  booktitle      = {Proceedings of the 35th {International} {Conference} on {Software} {Engineering} ({ICSE}'13), {Technical} {Research} {Track}},
  year           = {2013},
  pages          = {842--851},
  address        = {San Francisco, CA, USA},
  month          = may,
  abstract       = {There are more than twenty distinct software engineering tasks addressed with text retrieval (TR) techniques, such as, traceability link recovery, feature location, refactoring, reuse, etc. A common issue with all TR applications is that the results of the retrieval depend largely on the quality of the query. When a query performs poorly, it has to be reformulated and this is a difficult task for someone who had trouble writing a good query in the first place. We propose a recommender (called Refoqus) based on machine learning, which is trained with a sample of queries and relevant results. Then, for a given query, it automatically recommends a reformulation strategy that should improve its performance, based on the properties of the query. We evaluated Refoqus empirically against four baseline approaches that are used in natural language document retrieval. The data used for the evaluation corresponds to changes from five open source systems in Java and C++ and it is used in the context of TR-based concept location in source code. Refoqus outperformed the baselines and its recommendations lead to query performance improvement or preservation in 84\% of the cases (in average).},
  acceptancerate = {18.5},
  doi            = {10.1109/ICSE.2013.6606630},
  keywords       = {automatic query reformulation, C++, C++ language, Context, Engines, feature location, Frequency measurement, Java, learning (artificial intelligence), machine learning, Natural languages, query formulation, Query Reformulation, recommender system, Recommender Systems, Refoqus, Robustness, Software engineering, text retrieval, Traceability Link Recovery, Training, Training data},
  timestamp      = {2016.08.31},
}
@inproceedings{Haiduc2013a,
  author         = {Haiduc, S. and De Rosa, G. and Bavota, G. and Oliveto, R. and De Lucia, A. and Marcus, A.},
  title          = {{Query} {Quality} {Prediction} and {Reformulation} for {Source} {Code} {Search}: {The} {Refoqus} {Tool}},
  booktitle      = {Proceedings of the 35th {International} {Conference} on {Software} {Engineering} ({ICSE}'13), {Formal} {Demonstrations} {Track}},
  year           = {2013},
  pages          = {1307--1310},
  address        = {San Francisco, CA, USA},
  month          = may,
  abstract       = {Developers search source code frequently during their daily tasks, to find pieces of code to reuse, to find where to implement changes, etc. Code search based on text retrieval (TR) techniques has been widely used in the software engineering community during the past decade. The accuracy of the TR-based search results depends largely on the quality of the query used. We introduce Refoqus, an Eclipse plugin which is able to automatically detect the quality of a text retrieval query and to propose reformulations for it, when needed, in order to improve the results of TR-based code search. A video of Refoqus is found online at http://www.youtube.com/watch?v=UQlWGiauyk4.},
  acceptancerate = {31},
  doi            = {10.1109/ICSE.2013.6606704},
  keywords       = {Context, Eclipse plugin, Feature extraction, query formulation, Query Quality, query quality prediction, Query Reformulation, Refoqus tool, Software engineering, software maintenance, Software systems, source code search, text retrieval, text retrieval query, text retrieval technique, Training, Training data, TR-based search},
  shorttitle     = {Query quality prediction and reformulation for source code search},
  timestamp      = {2016.08.31},
}
@InCollection{Marcus2013,
  author    = {Marcus, Andrian and Haiduc, Sonia},
  title     = {Text {Retrieval} {Approaches} for {Concept} {Location} in {Source} {Code}},
  booktitle = {Software {Engineering}},
  series    = {Lecture {Notes} in {Computer} {Science}},
  volume    = {7171},
  pages     = {126--158},
  year      = {2013},
  editor    = {De Lucia, Andrea and Ferrucci, Filomena},
  publisher = {Springer Berlin Heidelberg},
  isbn      = {978-3-642-36053-4 978-3-642-36054-1},
  copyright = {2013 Springer-Verlag Berlin Heidelberg},
  keywords  = {Concept location, Concern location, feature location, information retrieval, Information Storage and Retrieval, Information Systems Applications (incl. Internet), Management of Computing and Information Systems, Programming Languages, Compilers, Interpreters, Programming Techniques, Software engineering, software maintenance},
  language  = {en},
  url       = {http://link.springer.com/chapter/10.1007/978-3-642-36054-1_5},
  urldate   = {2015-09-18},
  timestamp = {2016.08.31},
}
@InProceedings{Moreno2013,
  author         = {Moreno, L. and Bandara, W. and Haiduc, S. and Marcus, A.},
  title          = {On the {Vocabulary} {Relationship} {Between} {Bug} {Reports} and {Source} {Code}},
  booktitle      = {Proceedings of the 29th {IEEE} {International} {Conference} on {Software} {Maintenance} ({ICSM}'13), {Early} {Research} {Achievement} {Track} ({ERA})},
  year           = {2013},
  pages          = {452--455},
  address        = {Eindhoven, The Netherlands},
  month          = sep,
  acceptancerate = {41.4},
  doi            = {10.1109/ICSM.2013.70},
  issn           = {1063-6773},
  keywords       = {information retrieval;program debugging;text analysis;TR techniques;bug descriptions;bug location techniques;bug reports;source code;text retrieval techniques;Art;Computer bugs;Data collection;Large scale integration;Software systems;Vocabulary;Bug location;source code vocabulary;text retrieval},
  timestamp      = {2015.09.18},
}
@inproceedings{Haiduc2012,
  author         = {Haiduc, S. and Bavota, G. and Oliveto, R. and De Lucia, A. and Marcus, A.},
  title          = {{Automatic} {Query} {Performance} {Assessment} {During} the {Retrieval} of {Software} {Artifacts}},
  booktitle      = {Proceedings of the 27th {IEEE}/{ACM} {International} {Conference} on {Automated} {Software} {Engineering} ({ASE}'12), {Technical} {Research} {Track}},
  year           = {2012},
  pages          = {90--99},
  address        = {Essen, Germany},
  month          = sep,
  abstract       = {Text-based search and retrieval is used by developers in the context of many SE tasks, such as, concept location, traceability link retrieval, reuse, impact analysis, etc. Solutions for software text search range from regular expression matching to complex techniques using text retrieval. In all cases, the results of a search depend on the query formulated by the developer. A developer needs to run a query and look at the results before realizing that it needs reformulating. Our aim is to automatically assess the performance of a query before it is executed. We introduce an automatic query performance assessment approach for software artifact retrieval, which uses 21 measures from the field of text retrieval. We evaluate the approach in the context of concept location in source code. The evaluation shows that our approach is able to predict the performance of queries with 79\% accuracy, using very little training data.},
  acceptancerate = {15},
  doi            = {10.1145/2351676.2351690},
  keywords       = {automatic query performance assessment, Concept location, Query performance, query processing, regular expression matching, SE tasks, software artifact retrieval, Software engineering, software text search, source code, text analysis, text-based retrieval, text-based search, text retrieval},
  timestamp      = {2016.08.31},
}
@inproceedings{Haiduc2012a,
  author         = {Haiduc, S. and Bavota, G. and Oliveto, R. and Marcus, A. and De Lucia, A.},
  title          = {{Evaluating} the {Specificity} of {Text} {Retrieval} {Queries} to {Support} {Software} {Engineering} {Tasks}},
  booktitle      = {Proceedings of the 34th {International} {Conference} on {Software} {Engineering} ({ICSE}'12), {NIER} {Track}},
  year           = {2012},
  pages          = {1273--1276},
  address        = {Zurich, Switzerland},
  month          = jun,
  abstract       = {Text retrieval approaches have been used to address many software engineering tasks. In most cases, their use involves issuing a textual query to retrieve a set of relevant software artifacts from the system. The performance of all these approaches depends on the quality of the given query (i.e., its ability to describe the information need in such a way that the relevant software artifacts are retrieved during the search). Currently, the only way to tell that a query failed to lead to the expected software artifacts is by investing time and effort in analyzing the search results. In addition, it is often very difficult to ascertain what part of the query leads to poor results. We propose a novel pre-retrieval metric, which reflects the quality of a query by measuring the specificity of its terms. We exemplify the use of the new specificity metric on the task of concept location in source code. A preliminary empirical study shows that our metric is a good effort predictor for text retrieval-based concept location, outperforming existing techniques from the field of natural language document retrieval.},
  acceptancerate = {18},
  doi            = {10.1109/ICSE.2012.6227101},
  keywords       = {Concept location, Context, Correlation, Entropy, information retrieval, Measurement, Natural languages, natural language text, preretrieval metric, query processing, Query Quality, Query specificity, Software, software artifacts, software engineering tasks, software metrics, source code, specificity evaluation, specificity metric, text analysis, text retrieval, text retrieval-based concept location, text retrieval queries},
  timestamp      = {2016.08.31},
}
@InProceedings{w2012,
  author    = {Haiduc, Sonia and Bavota, G. and Oliveto, R. and Marcus, Andrian and De Lucia, A.},
  title     = {{Automatic} {Query} {Quality} {Assessment} for the {Retrieval} of {Software} {Artifacts}},
  booktitle = {{Workshop} on {The} {Next} {Five} {Years} of {Text} {Analysis} in {Software} {Maintenance}},
  year      = {2012},
  address   = {Riva del Garda, Italy},
  month     = sep,
}
@InProceedings{Abebe2011,
  author         = {Abebe, S.L. and Haiduc, S. and Tonella, P. and Marcus, A.},
  title          = {The {Effect} of {Lexicon} {Bad} {Smells} on {Concept} {Location} in {Source} {Code}},
  booktitle      = {Proceedings of the 11th {IEEE} {International} {Working} {Conference} on {Source} {Code} {Analysis} and {Manipulation} ({SCAM}'11), {Technical} {Research} {Track}},
  year           = {2011},
  pages          = {125--134},
  address        = {Williamsburg, VA, USA},
  month          = sep,
  abstract       = {Experienced programmers choose identifier names carefully, in the attempt to convey information about the role and behavior of the labeled code entity in a concise and expressive way. In fact, during program understanding the names given to code entities represent one of the major sources of information used by developers. We conjecture that lexicon bad smells, such as, extreme contractions, inconsistent term use, odd grammatical structure, etc., can hinder the execution of maintenance tasks which rely on program understanding. We propose an approach to determine the extent of this impact and instantiate it on the task of concept location. In particular, we conducted a study on two open source software systems where we investigated how lexicon bad smells affect Information Retrieval-based concept location. In this study, the classes changed in response to past modification requests are located before and after lexicon bad smells are identified and removed from the source code. The results indicate that lexicon bad smells impact concept location when using IR-based techniques.},
  acceptancerate = {31},
  doi            = {10.1109/SCAM.2011.18},
  keywords       = {code smells, Computer bugs, Concept location, Containers, extreme contraction, Filtering, grammars, identifier name, inconsistent term use, information retrieval, labeled code entity, lexicon bad smell, lexicon bad smells, Maintenance engineering, maintenance task execution, modification request, object-oriented programming, object-oriented software system, odd grammatical structure, open source software system, program comprehension, program understanding, public domain software, reverse engineering, software lexicon, software maintenance, Software systems, source code concept location, Terminology, text retrieval},
  timestamp      = {2016.08.31},
}
@InProceedings{Haiduc2011a,
  author   = {Haiduc, S.},
  title    = {{Automatically} {Detecting} the {Quality} of the {Query} and its {Implications} in {IR}-{Based} {Concept} {Location}},
  booktitle = {Proceedings of the 26th {IEEE}/{ACM} {International} {Conference} on {Automated} {Software} {Engineering} ({ASE}'11), {Doctoral} {Symposium}},
  year     = {2011},
  pages    = {637--640},
  address  = {Lawrence, KS, USA},
  month    = nov,
  abstract = {Concept location is an essential task during software maintenance and in particular program comprehension activities. One of the approaches to this task is based on leveraging the lexical information found in the source code by means of Information Retrieval techniques. All IR-based approaches to concept location are highly dependent on the queries written by the users. An IR approach, even though good on average, might fail when the input query is poor. Currently there is no way to tell when a query leads to poor results for IR-based concept location, unless a considerable effort is put into analyzing the results after the fact. We propose an approach based on recent advances in the field of IR research, which aims at automatically determining the difficulty a query poses to an IR-based concept location technique. We plan to evaluate several models and relate them to IR performance metrics.},
  doi      = {10.1109/ASE.2011.6100144},
  keywords = {Concept location, Conferences, Correlation, Estimation, information retrieval, information retrieval-based concept location, lexical information, Measurement, Prediction algorithms, program comprehension, program comprehension activities, query, query processing, Query Quality, search, search engines, software maintenance, source code},
  timestamp = {2016.08.31},
}
@InProceedings{Haiduc2011,
  author   = {Haiduc, S. and Marcus, A.},
  title    = {On the {Effect} of the {Query} in {IR}-based {Concept} {Location}},
  booktitle = {Proceedings of the 19th {IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC}'11), {Student} {Symposium}},
  year     = {2011},
  pages    = {234--237},
  address  = {Kingston, Canada},
  month    = jun,
  abstract = {Concept location is an essential task during software maintenance and in particular program comprehension activities. One of the approaches to this task is based on leveraging the lexical information found in the source code by means of Information Retrieval techniques. All IR-based approaches to concept location are highly dependent on the queries written by the users. An IR approach, even though good on average, might fail when the input query is poor. Currently there is no way to tell when a query leads to poor results for IR-based concept location, unless a considerable effort is put into analyzing the results after the fact. We propose an approach based on recent advances in the field of IR research, which aims at automatically determining the difficulty a query poses to an IR-based concept location technique. We plan to evaluate several models and relate them to IR performance metrics.},
  doi      = {10.1109/ICPC.2011.48},
  keywords = {Concept location, Conferences, Correlation, Estimation, information retrieval, information retrieval techniques, IR-based concept location, IR performance metrics, lexical information, Measurement, Prediction algorithms, program comprehension, program comprehension activity, query, query processing, search, search engines, software maintenance, software metrics, software performance evaluation, source code},
  timestamp = {2016.08.31},
}
@InProceedings{Haiduc2010,
  author         = {Haiduc, S. and Aponte, J. and Marcus, A.},
  title          = {{Supporting} {Program} {Comprehension} with {Source} {Code} {Summarization}},
  booktitle      = {Proceedings of the 32nd {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'10), {NIER} {Track}},
  year           = {2010},
  volume         = {2},
  pages          = {223--226},
  address        = {Cape Town, South Africa},
  month          = may,
  abstract       = {One of the main challenges faced by today's developers is keeping up with the staggering amount of source code that needs to be read and understood. In order to help developers with this problem and reduce the costs associated with it, one solution is to use simple textual descriptions of source code entities that developers can grasp easily, while capturing the code semantics precisely. We propose an approach to automatically determine such descriptions, based on automated text summarization technology.},
  acceptancerate = {25},
  doi            = {10.1145/1810295.1810335},
  keywords       = {automated text summarization, code semantics, cost reduction, Large scale integration, Natural languages, program comprehension, reverse engineering, Semantics, software cost estimation, Software engineering, software maintenance, Software systems, Source code summarization, summary, Tagging, text summarization, textual description},
  timestamp      = {2016.08.31},
}
@InProceedings{Haiduc2010a,
  author         = {Haiduc, S. and Aponte, J. and Moreno, L. and Marcus, A.},
  title          = {On the {Use} of {Automated} {Text} {Summarization} {Techniques} for {Summarizing} {Source} {Code}},
  booktitle      = {Proceedings of the 17th {Working} {Conference} on {Reverse} {Engineering} ({WCRE}'10), {Technical} {Research} {Track}},
  year           = {2010},
  pages          = {35--44},
  address        = {Beverly, MA, USA},
  month          = oct,
  abstract       = {During maintenance developers cannot read the entire code of large systems. They need a way to get a quick understanding of source code entities (such as, classes, methods, packages, etc.), so they can efficiently identify and then focus on the ones related to their task at hand. Sometimes reading just a method header or a class name does not tell enough about its purpose and meaning, while reading the entire implementation takes too long. We study a solution which mitigates the two approaches, i.e., short and accurate textual descriptions that illustrate the software entities without having to read the details of the implementation. We create such descriptions using techniques from automatic text summarization. The paper presents a study that investigates the suitability of various such techniques for generating source code summaries. The results indicate that a combination of text summarization techniques is most appropriate for source code summarization and that developers generally agree with the summaries produced.},
  acceptancerate = {31},
  doi            = {10.1109/WCRE.2010.13},
  keywords       = {automated text summarization technique, Computer science, Correlation, Large scale integration, Lead, program comprehension, Semantics, software entity, software maintenance, Software systems, Source code summarization, text summarization, textual description},
  timestamp      = {2016.08.31},
}
@InProceedings{Abebe2009a,
  author         = {Abebe, S.L. and Haiduc, S. and Marcus, A. and Tonella, P. and Antoniol, G.},
  title          = {Analyzing the {Evolution} of the {Source} {Code} {Vocabulary}},
  booktitle      = {Proceedings of the 13th {European} {Conference} on {Software} {Maintenance} and {Reengineering} ({CSMR} '09), {Technical} {Research} {Track}},
  year           = {2009},
  pages          = {189--198},
  address        = {Kaiserslautern, Germany},
  month          = mar,
  acceptancerate = {30},
  doi            = {10.1109/CSMR.2009.61},
  issn           = {1534-5351},
  keywords       = {software maintenance;programming language grammar;software artifact;software systems;source code;source code vocabulary;Computer languages;Computer science;Guidelines;Information analysis;Knowledge management;Software engineering;Software maintenance;Software systems;Vocabulary;Writing;Lexicon evolution;Software vocabulary;Text mining},
  timestamp      = {2015.09.18},
}
@InProceedings{Abebe2009,
  author         = {Abebe, S.L. and Haiduc, S. and Tonella, P. and Marcus, A.},
  title          = {Lexicon {Bad} {Smells} in {Software}},
  booktitle      = {Proceedings of the 16th {Working} {Conference} on {Reverse} {Engineering} ({WCRE} '09), {Technical} {Research} {Track}},
  year           = {2009},
  pages          = {95--99},
  address        = {Lille, France},
  month          = oct,
  abstract       = {We introduce the notion of ``lexicon bad smell'', which parallels that of ``code smell'' and indicates some potential lexicon construction problems that can be addressed through refactoring (e.g., renaming). We created a catalog of lexicon bad smells and we developed a publicly available suite of detectors to locate them. The paper presents a case study in which we used the detectors on two open-source systems. The study revealed the main challenges faced in detecting the lexicon bad smells.},
  acceptancerate = {25},
  doi            = {10.1109/WCRE.2009.26},
  keywords       = {Computer science, Detectors, Documentation, Face detection, lexicon construction problem, Open source software, open-source system, programming environments, Programming profession, public domain software, reverse engineering, software lexicon bad smell catalog, software maintenance, software refactoring, source code smell, Speech, Terminology},
  timestamp      = {2016.08.31},
}
@InProceedings{Gay2009,
  author         = {Gay, G. and Haiduc, S. and Marcus, A. and Menzies, T.},
  title          = {On the {Use} of {Relevance} {Feedback} in {{IR}}-{Based} {Concept} {Location}},
  booktitle      = {Proceedings of the 25th {IEEE} {International} {Conference} on {Software} {Maintenance} ({ICSM}'09), {Technical} {Research} {Track}},
  year           = {2009},
  pages          = {351--360},
  address        = {Edmonton, Canada},
  month          = sep,
  abstract       = {Concept location is a critical activity during software evolution as it produces the location where a change is to start in response to a modification request, such as, a bug report or a new feature request. Lexical-based concept location techniques rely on matching the text embedded in the source code to queries formulated by the developers. The efficiency of such techniques is strongly dependent on the ability of the developer to write good queries. We propose an approach to augment information retrieval (IR) based concept location via an explicit relevance feedback (RF) mechanism. RF is a two-part process in which the developer judges existing results returned by a search and the IR system uses this information to perform a new search, returning more relevant information to the user. A set of case studies performed on open source software systems reveals the impact of RF on IR based concept location.},
  acceptancerate = {22},
  doi            = {10.1109/ICSM.2009.5306315},
  keywords       = {Computer science, Humans, information retrieval, Internet, lexical-based concept location, Open source software, query processing, query writing, Radio frequency, relevance feedback, search engines, Software engineering, software evolution, software maintenance, Software systems, State feedback},
  timestamp      = {2016.08.31},
}
@InProceedings{Haiduc2008,
  author         = {Haiduc, S. and Marcus, A.},
  title          = {On the {Use} of {Domain} {Terms} in {Source} {Code}},
  booktitle      = {Proceedings of the 16th {IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC}'08), {Technical} {Research} {Track}},
  year           = {2008},
  pages          = {113--122},
  address        = {Amsterdam, The Netherlands},
  month          = jun,
  abstract       = {Information about the problem domain of the software and the solution it implements is often embedded by developers in comments and identifiers. When using software developed by others or when are new to a project, programmers know little about how domain information is reflected in the source code. Programmers often learn about the domain from external sources such as books, articles, etc. Hence, it is important to use in comments and identifiers terms that are commonly known in the domain literature, as it is likely that programmers will use such terms when searching the source code. The paper presents a case study that investigated how domain terms are used in comments and identifiers. The study focused on three research questions: (1) to what degree are domain terms found in the source code of software from a particular problem domain?; (2) which is the preponderant source of domain terms: identifiers or comments?; and (3) to what degree are domain terms shared between several systems from the same problem domain? Within the studied software, we found that in average: 42\% of the domain terms were used in the source code; 23\% of the domain terms used in the source code are present in comments only, whereas only 11\% in the identifiers alone, and there is a 63\% agreement in the use of domain terms between any two software systems.},
  acceptancerate = {35},
  doi            = {10.1109/ICPC.2008.29},
  keywords       = {Books, comments, Computer science, domain terms, Embedded software, Graphics, Graph theory, identifiers, Open source software, Programming profession, software development, Software engineering, Software libraries, Software systems, source code, Vocabulary},
  timestamp      = {2016.08.31},
}